I have a pandas DataFrame (df):
Parameter Date Concentration Unit Prescribed Standard Exceeding Standard? (Yes/No) Remarks
1 NaN 02/01/2017 26.10 µg/m3 NaN NaN NaN
2 NaN 03/01/2017 30.27 µg/m3 NaN NaN NaN
3 NaN 04/01/2017 36.36 µg/m3 NaN NaN NaN
4 NaN 05/01/2017 33.41 µg/m3 NaN NaN NaN
I am trying to make a time series plot of Date vs Concentration.
I converted the dates to datetime objects using pd.to_datetime().
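A sketch of that conversion (the dayfirst flag is an assumption on my part, since 02/01/2017 parses to 2017-01-02):
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)  # assumed day-first input
The frame then looks like: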
Parameter Date Concentration Unit Prescribed Standard Exceeding Standard? (Yes/No) Remarks
1 NaN 2017-01-02 26.10 µg/m3 NaN NaN NaN
2 NaN 2017-01-03 30.27 µg/m3 NaN NaN NaN
3 NaN 2017-01-04 36.36 µg/m3 NaN NaN NaN
4 NaN 2017-01-05 33.41 µg/m3 NaN NaN NaN
The time series plot is easily done using:
plt.plot(df["Date"][:322], df["Concentration"][:322], "+", color="red", linewidth=0.5)
But if I try to change the xtick labels to anything else, say month names, e.g. plt.xticks(["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"]), I get an error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~/.local/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
302 try:
--> 303 values, tz = tslib.datetime_to_datetime64(arg)
304 return DatetimeIndex._simple_new(values, name=name, tz=tz)
pandas/_libs/tslib.pyx in pandas._libs.tslib.datetime_to_datetime64()
TypeError: Unrecognized value type: <class 'str'>
During handling of the above exception, another exception occurred:
OutOfBoundsDatetime Traceback (most recent call last)
~/.local/lib/python3.6/site-packages/pandas/plotting/_converter.py in _convert_1d(values, unit, axis)
310 try:
--> 311 values = tools.to_datetime(values)
312 if isinstance(values, Index):
~/.local/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
379 elif is_list_like(arg):
--> 380 result = _convert_listlike(arg, box, format)
381 else:
~/.local/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
305 except (ValueError, TypeError):
--> 306 raise e
307
~/.local/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
293 yearfirst=yearfirst,
--> 294 require_iso8601=require_iso8601
295 )
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.convert_datetime_to_tsobject()
pandas/_libs/tslib.pyx in pandas._libs.tslib._check_dts_bounds()
OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 1-01-01 00:00:00
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-26-7f51ae7566c4> in <module>()
4
5 plt.plot(df_btm_pm25["Date"][:322], df_btm_pm25["Concentration"][:322], "+", color="red", linewidth=0.5)
----> 6 plt.xticks(["Jan", "Feb","Mar", "Apr", "May", "Jun", "Jul"])#, "Aug", "Sept", "Oct", "Nov", "Dec"])
7 #axes.set_xticks(x)
8 axes.locator_params(nbins=12)
~/.local/lib/python3.6/site-packages/matplotlib/pyplot.py in xticks(*args, **kwargs)
1722 labels = ax.get_xticklabels()
1723 elif len(args)==1:
-> 1724 locs = ax.set_xticks(args[0])
1725 labels = ax.get_xticklabels()
1726 elif len(args)==2:
~/.local/lib/python3.6/site-packages/matplotlib/axes/_base.py in set_xticks(self, ticks, minor)
3206 Default is ``False``.
3207 """
-> 3208 ret = self.xaxis.set_ticks(ticks, minor=minor)
3209 self.stale = True
3210 return ret
~/.local/lib/python3.6/site-packages/matplotlib/axis.py in set_ticks(self, ticks, minor)
1676 """
1677 # XXX if the user changes units, the information will be lost here
-> 1678 ticks = self.convert_units(ticks)
1679 if len(ticks) > 1:
1680 xleft, xright = self.get_view_interval()
~/.local/lib/python3.6/site-packages/matplotlib/axis.py in convert_units(self, x)
1524 return x
1525
-> 1526 ret = self.converter.convert(x, self.units, self)
1527 return ret
1528
~/.local/lib/python3.6/site-packages/pandas/plotting/_converter.py in convert(values, unit, axis)
278 for v in values]
279 else:
--> 280 values = DatetimeConverter._convert_1d(values, unit, axis)
281 return values
282
~/.local/lib/python3.6/site-packages/pandas/plotting/_converter.py in _convert_1d(values, unit, axis)
315 values = [_dt_to_float_ordinal(x) for x in values]
316 except Exception:
--> 317 values = _dt_to_float_ordinal(values)
318
319 return values
~/.local/lib/python3.6/site-packages/pandas/plotting/_converter.py in _dt_to_float_ordinal(dt)
263 base = dates.epoch2num(dt.asi8 / 1.0E9)
264 else:
--> 265 base = dates.date2num(dt)
266 return base
267
~/.local/lib/python3.6/site-packages/matplotlib/dates.py in date2num(d)
450 if not d.size:
451 return d
--> 452 return _to_ordinalf_np_vectorized(d)
453
454
~/.local/lib/python3.6/site-packages/numpy/lib/function_base.py in __call__(self, *args, **kwargs)
2753 vargs.extend([kwargs[_n] for _n in names])
2754
-> 2755 return self._vectorize_call(func=func, args=vargs)
2756
2757 def _get_ufunc_and_otypes(self, func, args):
~/.local/lib/python3.6/site-packages/numpy/lib/function_base.py in _vectorize_call(self, func, args)
2823 res = func()
2824 else:
-> 2825 ufunc, otypes = self._get_ufunc_and_otypes(func=func, args=args)
2826
2827 # Convert args to object arrays first
~/.local/lib/python3.6/site-packages/numpy/lib/function_base.py in _get_ufunc_and_otypes(self, func, args)
2783
2784 inputs = [arg.flat[0] for arg in args]
-> 2785 outputs = func(*inputs)
2786
2787 # Performance note: profiling indicates that -- for simple
~/.local/lib/python3.6/site-packages/matplotlib/dates.py in _to_ordinalf(dt)
253 tzi = UTC
254
--> 255 base = float(dt.toordinal())
256
257 # If it's sufficiently datetime-like, it will have a `date()` method
AttributeError: 'str' object has no attribute 'toordinal'
I can, however, change the ticks with set_xticks. What am I missing here?
As @ImportanceOfBeingErnest suggested, you should use locators and formatters to reformat the xtick labels. I have implemented them below:
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates # For formatting dates
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(15, 4.18))
# Make the original plot
ax[0].plot(df["Date"][:322],
df["Concentration"][:322],
"+", color="red", linewidth=0.5)
ax[0].set_title('Original plot')
# New xticks plot
months = mdates.MonthLocator() # Add tick every month
days = mdates.DayLocator(range(1,32,5)) # Add tick every 5th day in a month
monthFmt = mdates.DateFormatter('%b') # Use abbreviated month name
# Add the locators to the axis
ax[1].xaxis.set_major_locator(months)
ax[1].xaxis.set_major_formatter(monthFmt)
ax[1].xaxis.set_minor_locator(days)
ax[1].plot(df["Date"][:322],
df["Concentration"][:322],
"+", color="red", linewidth=0.5)
ax[1].set_title('Updated xticks')
plt.show()
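If the month labels crowd each other, matplotlib's fig.autofmt_xdate() helper rotates and right-aligns all x tick labels in one call:
fig.autofmt_xdate()
On matplotlib 3.1+, mdates.ConciseDateFormatter is an alternative to the explicit DateFormatter; a one-line sketch reusing the months locator defined above:
ax[1].xaxis.set_major_formatter(mdates.ConciseDateFormatter(months))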
Here are some helpful resources:
The matplotlib.dates api
strftime() directives
This example, which I drew from heavily
df:
avg        count  date                 val   prop      unit     distance  d-atmp   d-clouds  d-dewpoint
0.0786107  12     2014-10-03 00:00:00  22    atmp      (Deg C)  24829.6   24829.6  nan       nan
0.0786107  12     2014-10-03 00:00:00  0     clouds    (oktas)  22000.6   nan      22000.6   nan
0.0786107  12     2014-10-03 00:00:00  32    dewpoint  (Deg C)  21344.1   nan      nan       21344.1
0.0684246  6      2014-10-04 00:00:00  21.5  atmp      (Deg C)  26345.1   26345.1  nan       nan
cols = ['avg', 'date', 'count', 'd-atmp', 'd-clouds', 'd-dewpoint']
d = pd.pivot_table(df, index=cols, columns=['prop', 'unit'], values='val', aggfunc=max)
Ideal result:
date                 countObs  avg        d-atmp   atmp (Deg C)  d-clouds  clouds (oktas)  d-dewpoint  dewpoint (Deg C)
2014-10-03 00:00:00  12        0.0786107  24829.6  22            22000.6   0               21344.1     32
2014-10-04 00:00:00  6         0.0684246  26345.1  21.5          nan       nan             nan         nan
Error
---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in array_func(values)
   1067             try:
-> 1068                 result = self.grouper._cython_operation(
   1069                     "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in _cython_operation(self, kind, values, how, axis, min_count, **kwargs)
    998         ngroups = self.ngroups
--> 999         return cy_op.cython_operation(
   1000             values=values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in cython_operation(self, values, axis, min_count, comp_ids, ngroups, **kwargs)
    659
--> 660         return self._cython_op_ndim_compat(
    661             values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in _cython_op_ndim_compat(self, values, min_count, ngroups, comp_ids, mask, **kwargs)
    515
--> 516         return self._call_cython_op(
    517             values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in _call_cython_op(self, values, min_count, ngroups, comp_ids, mask, **kwargs)
    561         out_shape = self._get_output_shape(ngroups, values)
--> 562         func, values = self.get_cython_func_and_vals(values, is_numeric)
    563         out_dtype = self.get_out_dtype(values.dtype)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in get_cython_func_and_vals(self, values, is_numeric)
    204
--> 205         func = self._get_cython_function(kind, how, values.dtype, is_numeric)
    206
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in _get_cython_function(cls, kind, how, dtype, is_numeric)
    169             # raise NotImplementedError here rather than TypeError later
--> 170             raise NotImplementedError(
    171                 f"function is not implemented for this dtype: "
NotImplementedError: function is not implemented for this dtype: [how->mean,dtype->object]

During handling of the above exception, another exception occurred:

AssertionError                            Traceback (most recent call last)
<ipython-input-119-b64b487d2810> in <module>
      5 # o
      6 # cols += []
----> 7 d = pd.pivot_table(x, index=cols, columns=['osmcObsProperty', 'unit'], values='val') #, aggfunc=max #np.mean or max appear similar , dropna=False
      8
      9 d.reset_index(inplace=True)
~/.local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
     93         return table.__finalize__(data, method="pivot_table")
     94
---> 95     table = __internal_pivot_table(
     96         data,
     97         values,
~/.local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in __internal_pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
    163
    164     grouped = data.groupby(keys, observed=observed, sort=sort)
--> 165     agged = grouped.agg(aggfunc)
    166     if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
    167         agged = agged.dropna(how="all")
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    977
    978         op = GroupByApply(self, func, args, kwargs)
--> 979         result = op.agg()
    980         if not is_dict_like(func) and result is not None:
    981             return result
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in agg(self)
    156
    157         if isinstance(arg, str):
--> 158             return self.apply_str()
    159
    160         if is_dict_like(arg):
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply_str(self)
    505         elif self.axis != 0:
    506             raise ValueError(f"Operation {f} does not support axis=1")
--> 507         return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs)
    508
    509     def apply_multiple(self) -> FrameOrSeriesUnion:
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in _try_aggregate_string_function(self, obj, arg, *args, **kwargs)
    575         if f is not None:
    576             if callable(f):
--> 577                 return f(*args, **kwargs)
    578
    579         # people may try to aggregate on a non-callable attribute
~/.local/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in mean(self, numeric_only)
   1685         numeric_only = self._resolve_numeric_only(numeric_only)
   1686
-> 1687         result = self._cython_agg_general(
   1688             "mean",
   1689             alt=lambda x: Series(x).mean(numeric_only=numeric_only),
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
   1080         # TypeError -> we may have an exception in trying to aggregate
   1081         # continue and exclude the block
-> 1082         new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
   1083
   1084         if len(new_mgr) < len(data):
~/.local/lib/python3.9/site-packages/pandas/core/internals/managers.py in grouped_reduce(self, func, ignore_failures)
   1233             for sb in blk._split():
   1234                 try:
-> 1235                     applied = sb.apply(func)
   1236                 except (TypeError, NotImplementedError):
   1237                     if not ignore_failures:
~/.local/lib/python3.9/site-packages/pandas/core/internals/blocks.py in apply(self, func, **kwargs)
    379         """
    380         with np.errstate(all="ignore"):
--> 381             result = func(self.values, **kwargs)
    382
    383         return self._split_op_result(result)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in array_func(values)
   1074             # try to python agg
   1075             # TODO: shouldn't min_count matter?
-> 1076             result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
   1077
   1078         return result
~/.local/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in _agg_py_fallback(self, values, ndim, alt)
   1396         # should always be preserved by the implemented aggregations
   1397         # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
-> 1398         res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
   1399
   1400         if isinstance(values, Categorical):
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in agg_series(self, obj, func, preserve_dtype)
   1047
   1048         else:
-> 1049             result = self._aggregate_series_fast(obj, func)
   1050
   1051         npvalues = lib.maybe_convert_objects(result, try_float=False)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in _aggregate_series_fast(self, obj, func)
   1072         ids = ids.take(indexer)
   1073         sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups)
-> 1074         result, _ = sgrouper.get_result()
   1075         return result
   1076
pandas/_libs/reduction.pyx in pandas._libs.reduction.SeriesGrouper.get_result()
AssertionError: `result` has not been initialized.
IIUC, you can use groupby + agg. (The pivot_table call fails because its default aggfunc='mean' hits the object-dtype columns, hence the NotImplementedError: [how->mean,dtype->object]; an aggregation such as max is defined for any dtype.)
out = df.groupby('date', as_index=False).agg(max)
Output:
date                 avg        count  val   prop      unit     distance  d-atmp   d-clouds  d-dewpoint
2014-10-03 00:00:00  0.0786107  12     32    dewpoint  (oktas)  24829.6   24829.6  22000.6   21344.1
2014-10-04 00:00:00  0.0684246  6      21.5  atmp      (Deg C)  26345.1   26345.1  nan       nan
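Note that agg(max) reduces each column independently, which is why the 2014-10-03 row pairs prop=dewpoint with unit=(oktas): each cell is that column's maximum for the date, not a value taken from a single original row.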
You could pivot; then use groupby + max:
cols = ['avg', 'date', 'count', 'd-atmp', 'd-clouds', 'd-dewpoint']
tmp = df.pivot(index=cols, columns=['prop', 'unit'], values='val')
tmp.columns = tmp.columns.map(' '.join)
out = tmp.reset_index().groupby('date', as_index=False).max()\
[['date', 'count', 'avg', 'd-atmp', 'atmp (Deg C)', 'd-clouds',
'clouds (oktas)', 'd-dewpoint', 'dewpoint (Deg C)']]
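The columns.map(' '.join) step flattens the two-level ('prop', 'unit') column index into single strings such as 'atmp (Deg C)', matching the headers in the ideal result.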
Output:
date count avg d-atmp atmp (Deg C) d-clouds clouds (oktas) d-dewpoint dewpoint (Deg C)
0 2014-10-03 00:00:00 12 0.078611 24829.6 22.0 22000.6 0.0 21344.1 32.0
1 2014-10-04 00:00:00 6 0.068425 26345.1 21.5 NaN NaN NaN NaN
I would like to know why the following code doesn't work. I want a rolling annualized risk measure of the S&P 500. First I imported data from Yahoo Finance, selected the close price, and took the last close price of each month. Then, for each date, I wanted the return over the trailing 36-month window, and the same for the risk metric.
import pandas as pd
import numpy as np
import yfinance as yf
SP500=yf.download("SPY", start = "2020-01-01", end = "2020-12-01")
close = SP500.loc[:, "Close"].copy()
month_ret = close.resample("M", kind = "period").last().pct_change().dropna()
month_ret["Return"] = month_ret.rolling(36).mean()*12
month_ret["Risk"] = month_ret.rolling(36).std()*np.sqrt(12)
month_ret.tail()
I've got the following message:
ValueError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\window.py in _prep_values(self, values)
231 try:
--> 232 values = ensure_float64(values)
233 except (ValueError, TypeError):
pandas\_libs\algos_common_helper.pxi in pandas._libs.algos.ensure_float64()
ValueError: setting an array element with a sequence.
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\window.py in _apply(self, func, name, window, center, check_minp, **kwargs)
911 try:
--> 912 values = self._prep_values(b.values)
913
~\Anaconda3\lib\site-packages\pandas\core\window.py in _prep_values(self, values)
234 raise TypeError(
--> 235 "cannot handle this type -> {0}" "".format(values.dtype)
236 )
TypeError: cannot handle this type -> object
During handling of the above exception, another exception occurred:
DataError Traceback (most recent call last)
<ipython-input-18-6480c4316ca2> in <module>
----> 1 month_ret["Risk"] = month_ret.rolling(36).std()*np.sqrt(12)
~\Anaconda3\lib\site-packages\pandas\core\window.py in std(self, ddof, *args, **kwargs)
1886 def std(self, ddof=1, *args, **kwargs):
1887 nv.validate_rolling_func("std", args, kwargs)
-> 1888 return super().std(ddof=ddof, **kwargs)
1889
1890 #Substitution(name="rolling")
~\Anaconda3\lib\site-packages\pandas\core\window.py in std(self, ddof, *args, **kwargs)
1292
1293 return self._apply(
-> 1294 f, "std", check_minp=_require_min_periods(1), ddof=ddof, **kwargs
1295 )
1296
~\Anaconda3\lib\site-packages\pandas\core\window.py in _apply(self, func, name, window, center, check_minp, **kwargs)
918 continue
919 else:
--> 920 raise DataError("No numeric types to aggregate")
921
922 if values.size == 0:
DataError: No numeric types to aggregate
There are two causes of the error. The first is that close is selected with single brackets (SP500.loc[:, "Close"]), which returns a Series; the assignment month_ret["Return"] = ... then stores an entire Series in a single element, turning month_ret into object dtype, which rolling cannot aggregate (hence "cannot handle this type -> object" and "No numeric types to aggregate"). Selecting with double brackets, SP500.loc[:, ["Close"]], keeps a DataFrame. The second is that the calculation of the risk value is not restricted to the Return column, which means the NaN values from the 36-month warm-up window are included in the calculation. (I also extended the download range, since a 36-month rolling window needs at least 36 monthly observations, while a single year of data provides far fewer.) Since I am not familiar with investments, does the following code meet the intent of your question?
import pandas as pd
import numpy as np
import yfinance as yf
SP500=yf.download("SPY", start = "2015-01-01", end = "2020-12-01")
close = SP500.loc[:, ["Close"]].copy()
month_ret = close.resample("M", kind="period").last().pct_change().dropna()
month_ret["Return"] = month_ret.rolling(36).mean()*12
month_ret.fillna(0, inplace=True)
month_ret["Risk"] = month_ret['Return'].rolling(36).std()*np.sqrt(12)
month_ret.tail(37)
Close Return Risk
Date
2017-11 0.030566 0.000000 NaN
2017-12 0.006981 0.092355 0.053321
2018-01 0.056359 0.121018 0.086672
2018-02 -0.036360 0.090163 0.099274
2018-03 -0.031290 0.086426 0.108837
2018-04 0.005168 0.084871 0.116688
2018-05 0.024309 0.088689 0.124241
2018-06 0.001255 0.097458 0.132499
2018-07 0.037047 0.102278 0.140553
2018-08 0.031920 0.133234 0.154062
2018-09 0.001412 0.143890 0.167868
2018-10 -0.069104 0.092502 0.170876
2018-11 0.018549 0.097467 0.173960
2018-12 -0.093343 0.074052 0.174226
2019-01 0.080066 0.117336 0.178965
2019-02 0.032416 0.128416 0.184521
2019-03 0.013636 0.112365 0.186919
2019-04 0.040852 0.124669 0.190188
2019-05 -0.063771 0.097741 0.189812
2019-06 0.064410 0.119783 0.191078
2019-07 0.015119 0.112665 0.190892
2019-08 -0.016743 0.106685 0.189555
2019-09 0.014772 0.113265 0.188173
2019-10 0.022105 0.126412 0.187371
2019-11 0.036198 0.126199 0.185726
2019-12 0.024021 0.129441 0.183558
2020-01 -0.000404 0.123342 0.179933
2020-02 -0.079166 0.083856 0.173722
2020-03 -0.129987 0.041556 0.168637
2020-04 0.126984 0.080575 0.161391
2020-05 0.047645 0.091753 0.153379
2020-06 0.013275 0.095681 0.144406
2020-07 0.058892 0.108460 0.134495
2020-08 0.069797 0.130753 0.124329
2020-09 -0.041281 0.111955 0.110565
2020-10 -0.024934 0.095790 0.093477
2020-11 0.108777 0.121860 0.071686
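As a side note, annualized volatility is usually computed from the monthly returns themselves rather than from the rolling mean; a minimal sketch under that interpretation (Close here holds the monthly returns, as above):
month_ret["Risk"] = month_ret["Close"].rolling(36).std() * np.sqrt(12)  # std of raw monthly returns, annualized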
I am trying to solve a very simple problem, but am running into a wall.
I have a DateTimeIndex based on a simple dataframe like follows:
df=pd.DataFrame(
index=pd.date_range(
start='2017-01-01',
end='2017-03-04', closed=None),
data=np.arange(63), columns=['val']).rename_axis(index='date')
In [179]: df
Out[179]:
val
date
2017-01-01 0
2017-01-02 1
2017-01-03 2
2017-01-04 3
2017-01-05 4
... ...
2017-02-28 58
2017-03-01 59
2017-03-02 60
2017-03-03 61
2017-03-04 62
[63 rows x 1 columns]
I wish to summarize the values over weekly, semi-monthly, monthly, etc. periods.
So I tried:
In [180]: df.to_period('W').groupby('date').sum()
Out[180]:
val
date
2016-12-26/2017-01-01 0
2017-01-02/2017-01-08 28
2017-01-09/2017-01-15 77
2017-01-16/2017-01-22 126
2017-01-23/2017-01-29 175
2017-01-30/2017-02-05 224
2017-02-06/2017-02-12 273
2017-02-13/2017-02-19 322
2017-02-20/2017-02-26 371
2017-02-27/2017-03-05 357
That works fine for offset aliases like Y, M, D, W, T, S, L, U, N, but it fails for SM, SMS and others listed here: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
It raises a ValueError exception:
In [181]: df.to_period('SMS').groupby('date').sum()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
pandas/_libs/tslibs/frequencies.pyx in pandas._libs.tslibs.frequencies._period_str_to_code()
KeyError: 'SMS-15'

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-181-6779559a0596> in <module>
----> 1 df.to_period('SMS').groupby('date').sum()
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/frame.py in to_period(self, freq, axis, copy)
   8350         axis = self._get_axis_number(axis)
   8351         if axis == 0:
-> 8352             new_data.set_axis(1, self.index.to_period(freq=freq))
   8353         elif axis == 1:
   8354             new_data.set_axis(0, self.columns.to_period(freq=freq))
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/accessor.py in f(self, *args, **kwargs)
     91         def _create_delegator_method(name):
     92             def f(self, *args, **kwargs):
---> 93                 return self._delegate_method(name, *args, **kwargs)
     94
     95             f.__name__ = name
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/indexes/datetimelike.py in _delegate_method(self, name, *args, **kwargs)
    811
    812     def _delegate_method(self, name, *args, **kwargs):
--> 813         result = operator.methodcaller(name, *args, **kwargs)(self._data)
    814         if name not in self._raw_methods:
    815             result = Index(result, name=self.name)
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/arrays/datetimes.py in to_period(self, freq)
   1280             freq = get_period_alias(freq)
   1281
-> 1282         return PeriodArray._from_datetime64(self._data, freq, tz=self.tz)
   1283
   1284     def to_perioddelta(self, freq):
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/arrays/period.py in _from_datetime64(cls, data, freq, tz)
    273         PeriodArray[freq]
    274         """
--> 275         data, freq = dt64arr_to_periodarr(data, freq, tz)
    276         return cls(data, freq=freq)
    277
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/arrays/period.py in dt64arr_to_periodarr(data, freq, tz)
    914         data = data._values
    915
--> 916     base, mult = libfrequencies.get_freq_code(freq)
    917     return libperiod.dt64arr_to_periodarr(data.view("i8"), base, tz), freq
    918
pandas/_libs/tslibs/frequencies.pyx in pandas._libs.tslibs.frequencies.get_freq_code()
pandas/_libs/tslibs/frequencies.pyx in pandas._libs.tslibs.frequencies.get_freq_code()
pandas/_libs/tslibs/frequencies.pyx in pandas._libs.tslibs.frequencies.get_freq_code()
pandas/_libs/tslibs/frequencies.pyx in pandas._libs.tslibs.frequencies._period_str_to_code()
ValueError: Invalid frequency: SMS-15
I am using Python 3.6.5 and pandas 0.25.1.
Use DataFrame.resample here:
print (df.resample('W').sum())
val
date
2017-01-01 0
2017-01-08 28
2017-01-15 77
2017-01-22 126
2017-01-29 175
2017-02-05 224
2017-02-12 273
2017-02-19 322
2017-02-26 371
2017-03-05 357
print (df.resample('SM').sum())
val
date
2016-12-31 91
2017-01-15 344
2017-01-31 555
2017-02-15 663
2017-02-28 300
print (df.resample('SMS').sum())
val
date
2017-01-01 91
2017-01-15 374
2017-02-01 525
2017-02-15 721
2017-03-01 242
Alternatives with groupby and Grouper:
print (df.groupby(pd.Grouper(freq='W')).sum())
print (df.groupby(pd.Grouper(freq='SM')).sum())
print (df.groupby(pd.Grouper(freq='SMS')).sum())
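For completeness: to_period fails for SM/SMS because they are anchored DateOffsets with no corresponding Period frequency (the KeyError: 'SMS-15' comes from the period-frequency lookup), whereas resample and Grouper bin the DatetimeIndex by any offset alias directly.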
I am using read_sql_query from pandas version 0.22.0 to pull time series data from a local PostgreSQL database. If I do not parse the date columns, then I get the following DataFrame:
dataid localminute use
0 1642 2012-05-11 19:00:00-05:00 0.827
1 1642 2012-05-11 19:01:00-05:00 0.830
2 1642 2012-05-11 19:02:00-05:00 0.833
3 1642 2012-05-11 19:03:00-05:00 0.835
4 1642 2012-05-11 19:04:00-05:00 0.837
The localminute column has dtype=object and contains a mix of objects such as
datetime.datetime(2012, 5, 11, 19, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None))
datetime.datetime(2012, 12, 9, 2, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=-360, name=None))
Note that these objects have different tzinfo (the UTC offsets differ by an hour).
If I try to pass parse_dates=["localminute"] when using pd.read_sql_query I get the following error and traceback.
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
302 try:
--> 303 values, tz = tslib.datetime_to_datetime64(arg)
304 return DatetimeIndex._simple_new(values, name=name, tz=tz)
pandas/_libs/tslib.pyx in pandas._libs.tslib.datetime_to_datetime64()
ValueError: Array must be all same time zone
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-8-0d092e8e183b> in <module>()
6 end_time = pd.Timestamp("2015-06-01", tz="US/Central")
7
----> 8 usage_df = pecanpy.read_electricity_egauge_minutes_query(local_con, "public", "all", dataid, start_time, end_time)
c:\users\pughdr\research\pecanpy\pecanpy\api.py in read_electricity_egauge_minutes_query(con, schema, columns, dataid, start_time, end_time, tz)
32 query = template.format(**kwargs)
33 #parse_dates= {"localminute": {}}
---> 34 df = pd.read_sql_query(query, con=con, parse_dates=["localminute"])
35
36 # if the time period of interest contains observations only from within the
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in read_sql_query(sql, con, index_col, coerce_float, params, parse_dates, chunksize)
330 return pandas_sql.read_query(
331 sql, index_col=index_col, params=params, coerce_float=coerce_float,
--> 332 parse_dates=parse_dates, chunksize=chunksize)
333
334
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize)
1102 frame = _wrap_result(data, columns, index_col=index_col,
1103 coerce_float=coerce_float,
-> 1104 parse_dates=parse_dates)
1105 return frame
1106
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in _wrap_result(data, columns, index_col, coerce_float, parse_dates)
157 coerce_float=coerce_float)
158
--> 159 _parse_date_columns(frame, parse_dates)
160
161 if index_col is not None:
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in _parse_date_columns(data_frame, parse_dates)
138 except TypeError:
139 fmt = None
--> 140 data_frame[col_name] = _handle_date_column(df_col, format=fmt)
141
142 # we want to coerce datetime64_tz dtypes for now
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in _handle_date_column(col, utc, format)
117 .astype('datetime64[ns, UTC]'))
118 else:
--> 119 return to_datetime(col, errors='coerce', format=format, utc=utc)
120
121
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
371 elif isinstance(arg, ABCSeries):
372 from pandas import Series
--> 373 values = _convert_listlike(arg._values, True, format)
374 result = Series(values, index=arg.index, name=arg.name)
375 elif isinstance(arg, (ABCDataFrame, MutableMapping)):
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
304 return DatetimeIndex._simple_new(values, name=name, tz=tz)
305 except (ValueError, TypeError):
--> 306 raise e
307
308 if arg is None:
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
292 dayfirst=dayfirst,
293 yearfirst=yearfirst,
--> 294 require_iso8601=require_iso8601
295 )
296
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True
The ValueError is being thrown because the timestamps have different UTC offsets due to Daylight Saving Time (DST). I don't understand the source of the second error.
How can I use read_sql_query to load a DataFrame with a localminute column that is time zone aware with a "US/Central" time zone?
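Update: based on the hint in the final ValueError (utc=True), something like the following might work; this is an untested sketch, and US/Central is simply the zone I want:
df = pd.read_sql_query(query, con=con,
                       parse_dates={"localminute": {"utc": True}})  # parse mixed offsets as UTC
df["localminute"] = df["localminute"].dt.tz_convert("US/Central")   # then convert to US/Central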
I have a potential pandas bug, or maybe I've just been staring at this too long. I have not had issues using xs on a MultiIndex before. The code is below, and I've verified that the error occurs both on Python 2 with pandas 0.16.2 and on Python 3 with pandas 0.17.0:
In [1]:
import sys
if sys.version[0] == '2':
from StringIO import StringIO
if sys.version[0] == '3':
from io import StringIO
import pandas as pd
sstring = """\
m,p,tstep,value,jday,normed_value,datetime
6,407,0,1,564.5,5.75,1964-07-18 12:00:00
6,407,0,1,564.5,5.75,1964-07-18 12:00:00
7,407,0,1,564.5,5.75,1964-07-18 12:00:00
8,408,0,1,564.5,6.75,1964-07-18 12:00:00
"""
subset = pd.read_csv(StringIO(sstring),
index_col=['m', 'p'],
parse_dates=['datetime'])
subset.xs(6, level='m')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-1-dbc4b09ce656> in <module>()
21 print(subset)
22
---> 23 subset.xs(6, level='m')
C:\Anaconda\lib\site-packages\pandas\core\generic.pyc in xs(self, key, axis, level, copy, drop_level)
1458
1459 result = self.ix[indexer]
-> 1460 setattr(result, result._get_axis_name(axis), new_ax)
1461 return result
1462
C:\Anaconda\lib\site-packages\pandas\core\generic.pyc in __setattr__(self, name, value)
2159 try:
2160 object.__getattribute__(self, name)
-> 2161 return object.__setattr__(self, name, value)
2162 except AttributeError:
2163 pass
pandas\src\properties.pyx in pandas.lib.AxisProperty.__set__ (pandas\lib.c:42548)()
C:\Anaconda\lib\site-packages\pandas\core\generic.pyc in _set_axis(self, axis, labels)
411
412 def _set_axis(self, axis, labels):
--> 413 self._data.set_axis(axis, labels)
414 self._clear_item_cache()
415
C:\Anaconda\lib\site-packages\pandas\core\internals.pyc in set_axis(self, axis, new_labels)
2217 if new_len != old_len:
2218 raise ValueError('Length mismatch: Expected axis has %d elements, '
-> 2219 'new values have %d elements' % (old_len, new_len))
2220
2221 self.axes[axis] = new_labels
ValueError: Length mismatch: Expected axis has 4 elements, new values have 2 elements
However, not specifying a level works, as does using .loc, as seen here:
In [16]:
print(subset.xs(6))
print(subset.loc[6])
tstep value jday normed_value datetime
p
407 0 1 564.5 5.75 1964-07-18 12:00:00
407 0 1 564.5 5.75 1964-07-18 12:00:00
tstep value jday normed_value datetime
p
407 0 1 564.5 5.75 1964-07-18 12:00:00
407 0 1 564.5 5.75 1964-07-18 12:00:00
Does anyone have some insight into this behavior?
Until the following issue (https://github.com/pydata/pandas/issues/13719) is closed, the following is a fix:
subset.xs((6,), level=['m'])
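As the question itself shows, subset.xs(6) with no level argument, or subset.loc[6], also sidesteps the error when dropping the m level is acceptable.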