Get output of statsmodels acf function with a Pandas rolling window - python

I can extract an autocorrelation value for a specific lag time with this:
df.rolling(window = 10).apply(lambda x: acf(x, nlags = 5)[5]).plot()
However since acf is actually doing all the calculations anyway, I'd like to get all the results calculated, not just a single one. The idea would be that I could then unpack this single returned array/list into a bunch of columns and plot each one separately but not run through acf so many unnecessary times. So I tried:
df.rolling(window = 10).apply(lambda x: list(acf(x, nlags = 5)))
This throws the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-63-e5f337100eb5> in <module>()
----> 1 df.rolling(window = 10).apply(lambda x: list(acf(x, nlags = 5)))
/Users/a/anaconda3/lib/python3.5/site-packages/pandas/core/window.py in apply(self, func, args, kwargs)
861 #Appender(_shared_docs['apply'])
862 def apply(self, func, args=(), kwargs={}):
--> 863 return super(Rolling, self).apply(func, args=args, kwargs=kwargs)
864
865 #Substitution(name='rolling')
/Users/a/anaconda3/lib/python3.5/site-packages/pandas/core/window.py in apply(self, func, args, kwargs)
619
620 return self._apply(f, func, args=args, kwargs=kwargs,
--> 621 center=False)
622
623 def sum(self, **kwargs):
/Users/a/anaconda3/lib/python3.5/site-packages/pandas/core/window.py in _apply(self, func, name, window, center, check_minp, how, **kwargs)
556
557 if values.ndim > 1:
--> 558 result = np.apply_along_axis(calc, self.axis, values)
559 else:
560 result = calc(values)
/Users/a/anaconda3/lib/python3.5/site-packages/numpy/lib/shape_base.py in apply_along_axis(func1d, axis, arr, *args, **kwargs)
89 outshape = asarray(arr.shape).take(indlist)
90 i.put(indlist, ind)
---> 91 res = func1d(arr[tuple(i.tolist())], *args, **kwargs)
92 # if res is a number, then we have a smaller output array
93 if isscalar(res):
/Users/a/anaconda3/lib/python3.5/site-packages/pandas/core/window.py in calc(x)
553
554 def calc(x):
--> 555 return func(x, window, min_periods=self.min_periods)
556
557 if values.ndim > 1:
/Users/a/anaconda3/lib/python3.5/site-packages/pandas/core/window.py in f(arg, window, min_periods)
616 minp = _use_window(min_periods, window)
617 return algos.roll_generic(arg, window, minp, offset, func, args,
--> 618 kwargs)
619
620 return self._apply(f, func, args=args, kwargs=kwargs,
pandas/algos.pyx in pandas.algos.roll_generic (pandas/algos.c:51581)()
TypeError: a float is required
Does this mean apply style operations with rolling can only handle floats? At least for groupby I have often had occasion to return lists or sets, but perhaps rolling is not so flexible?

To plot acf results you may want to try tsaplots.plot_acf():
from statsmodels.graphics import tsaplots
tsaplots.plot_acf(x, lags = 5, alpha = 0.05)

Related

Unit issue with MetPy's parcel_profile function

I have been working on programming to plot Skew_Ts from Wyoming's weather servers. The issue I am having is I get an error when attempting to run the parcel_profile function, it says it can not convert from dimensionless to hectopascals. The pressure array being fed into the function as well as the temperature and dewpoint data point have the appropriate units attached though. To add to my confusion, I have the exact same coding on another machine with the same library versions and it runs fine on that one. Am I missing an obvious problem? Code and relevant library versions are listed below:
import metpy as mp
from metpy.units import units
import metpy.calc as mpcalc
from siphon.simplewebservice.wyoming import WyomingUpperAir
from datetime import datetime
import pandas as pd
import numpy as np
final_time = datetime(2022, 1, 21, 12)
station = 'ABQ'
df = WyomingUpperAir.request_data(final_time, station)
data_dict = {"Press":"", "Temp": "", "Dew_Point": "", "Height":"",
"Mask": "", "Parcel": "", "Idx": "", "U": "", "V": ""}
data_dict['Press'] = df['pressure'].values * units(df.units['pressure'])
data_dict['Temp'] = df['temperature'].values * units(df.units['temperature'])
data_dict['Dew_Point'] = df['dewpoint'].values * units(df.units['dewpoint'])
data_dict['Height'] = df['height'].values * units(df.units['height'])
data_dict['U'] = df['u_wind'].values * units(df.units['u_wind'])
data_dict['V'] = df['v_wind'].values * units(df.units['v_wind'])
data_dict['Parcel'] = mpcalc.parcel_profile(data_dict['Press'],
data_dict['Temp'][0],
data_dict['Dew_Point'][0]).to('degC')
Error:
DimensionalityError Traceback (most recent call last)
C:\Users\####################.py in <module>
----> 1 data_dict['Parcel'] = mpcalc.parcel_profile(data_dict['Press'],
2 data_dict['Temp'][0],
3 data_dict['Dew_Point'][0]).to('degC')
~\anaconda3\envs\Met_World\lib\site-packages\metpy\xarray.py in wrapper(*args, **kwargs)
1214
1215 # Evaluate inner calculation
-> 1216 result = func(*bound_args.args, **bound_args.kwargs)
1217
1218 # Wrap output based on match and match_unit
~\anaconda3\envs\Met_World\lib\site-packages\metpy\units.py in wrapper(*args, **kwargs)
244 'that the function is being called properly.\n') + msg
245 raise ValueError(msg)
--> 246 return func(*args, **kwargs)
247
248 return wrapper
~\anaconda3\envs\Met_World\lib\site-packages\metpy\calc\thermo.py in parcel_profile(pressure, temperature, dewpoint)
737
738 """
--> 739 _, _, _, t_l, _, t_u = _parcel_profile_helper(pressure, temperature, dewpoint)
740 return concatenate((t_l, t_u))
741
~\anaconda3\envs\Met_World\lib\site-packages\metpy\calc\thermo.py in _parcel_profile_helper(pressure, temperature, dewpoint)
892
893 # If the pressure profile doesn't make it to the lcl, we can stop here
--> 894 if _greater_or_close(np.nanmin(pressure), press_lcl):
895 return (press_lower[:-1], press_lcl, units.Quantity(np.array([]), press_lower.units),
896 temp_lower[:-1], temp_lcl, units.Quantity(np.array([]), temp_lower.units))
~\anaconda3\envs\Met_World\lib\site-packages\metpy\calc\tools.py in _greater_or_close(a, value, **kwargs)
738
739 """
--> 740 return (a > value) | np.isclose(a, value, **kwargs)
741
742
~\anaconda3\envs\Met_World\lib\site-packages\pint\quantity.py in __array_ufunc__(self, ufunc, method, *inputs, **kwargs)
1721 )
1722
-> 1723 return numpy_wrap("ufunc", ufunc, inputs, kwargs, types)
1724
1725 def __array_function__(self, func, types, args, kwargs):
~\anaconda3\envs\Met_World\lib\site-packages\pint\numpy_func.py in numpy_wrap(func_type, func, args, kwargs, types)
919 if name not in handled or any(is_upcast_type(t) for t in types):
920 return NotImplemented
--> 921 return handled[name](*args, **kwargs)
~\anaconda3\envs\Met_World\lib\site-packages\pint\numpy_func.py in implementation(*args, **kwargs)
284 if input_units == "all_consistent":
285 # Match all input args/kwargs to same units
--> 286 stripped_args, stripped_kwargs = convert_to_consistent_units(
287 *args, pre_calc_units=first_input_units, **kwargs
288 )
~\anaconda3\envs\Met_World\lib\site-packages\pint\numpy_func.py in convert_to_consistent_units(pre_calc_units, *args, **kwargs)
105 """
106 return (
--> 107 tuple(convert_arg(arg, pre_calc_units=pre_calc_units) for arg in args),
108 {
109 key: convert_arg(arg, pre_calc_units=pre_calc_units)
~\anaconda3\envs\Met_World\lib\site-packages\pint\numpy_func.py in <genexpr>(.0)
105 """
106 return (
--> 107 tuple(convert_arg(arg, pre_calc_units=pre_calc_units) for arg in args),
108 {
109 key: convert_arg(arg, pre_calc_units=pre_calc_units)
~\anaconda3\envs\Met_World\lib\site-packages\pint\numpy_func.py in convert_arg(arg, pre_calc_units)
87 return arg
88 else:
---> 89 raise DimensionalityError("dimensionless", pre_calc_units)
90 elif _is_quantity(arg):
91 return arg.m
DimensionalityError: Cannot convert from 'dimensionless' to 'hectopascal'
Libraries used:
python 3.9.7
metpy 1.1.0
pandas 1.2.4
numpy 1.22.0
xarray 0.20.2
My first guess is that this is a problem with multiplying whatever e.g. df['u_wind'].values is returning by units. While it's a nicer syntax, the more robust way is to use the Quantity constructor:
data_dict['Press'] = units.Quantity(df['pressure'].values, units(df.units['pressure']))
You can shorten all of that, though, by letting MetPy attach the units for you with its helper metpy.units.pandas_dataframe_to_unit_arrays:
# pandas_dataframe_to_unit_arrays is a function of the metpy.units module, not
# an attribute of the `units` registry object, so import it and call it directly.
from metpy.units import pandas_dataframe_to_unit_arrays
data_dict = pandas_dataframe_to_unit_arrays(df)
If you want the column names you were originally using, you can change them with df.rename().

Dask.Series .mean().compute() results in "TypeError: _sum() got an unexpected keyword argument 'skipna' "

I do not grasp how the dask module uses lazy evaluations under the hood.
In the minimal example the numbers replace a significantly larger data set, hence the float64 (overflow).
What is the error in this "dask-like" syntax, and why does it produce this (seemingly unrelated?) error?
import pandas as pd
import numpy as np
from dask import dataframe as dd
df = pd.DataFrame({'foo': [[1,2,3], [4,5,6]]})
ddf = dd.from_pandas(df, npartitions=2)
In pandas I would do
In[1]: df['foo'].apply(np.float64).mean()
Out[1]: array([2.5, 3.5, 4.5])
Which is the same as
In[2]: ddf['foo'].apply(np.float64, meta=('foo','f8')).compute().mean()
Out[2]: array([2.5, 3.5, 4.5])
since I first evaluate the pandas.Series and then calculate the mean.
To make the whole calculation lazy, I tried
In[3]: ddf['foo'].apply(np.float64, meta=('foo','f8')).mean().compute()
but I do not understand the TypeError.
The stack trace:
TypeError Traceback (most recent call last)
<ipython-input-6-8704408e68ef> in <module>
----> 1 ddf['foo'].apply(np.float64, meta=('foo','f8')).mean().compute()
~/miniconda3/envs/flash-hdf/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
277 dask.base.compute
278 """
--> 279 (result,) = compute(self, traverse=False, **kwargs)
280 return result
281
~/miniconda3/envs/flash-hdf/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
559 postcomputes.append(x.__dask_postcompute__())
560
--> 561 results = schedule(dsk, keys, **kwargs)
562 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
563
~/miniconda3/envs/flash-hdf/lib/python3.8/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
74 pools[thread][num_workers] = pool
75
---> 76 results = get_async(
77 pool.apply_async,
78 len(pool._pool),
~/miniconda3/envs/flash-hdf/lib/python3.8/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
485 _execute_task(task, data) # Re-execute locally
486 else:
--> 487 raise_exception(exc, tb)
488 res, worker_id = loads(res_info)
489 state["cache"][key] = res
~/miniconda3/envs/flash-hdf/lib/python3.8/site-packages/dask/local.py in reraise(exc, tb)
315 if exc.__traceback__ is not tb:
316 raise exc.with_traceback(tb)
--> 317 raise exc
318
319
~/miniconda3/envs/flash-hdf/lib/python3.8/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
220 try:
221 task, data = loads(task_info)
--> 222 result = _execute_task(task, data)
223 id = get_id()
224 result = dumps((result, id))
~/miniconda3/envs/flash-hdf/lib/python3.8/site-packages/dask/core.py in _execute_task(arg, cache, dsk)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
~/miniconda3/envs/flash-hdf/lib/python3.8/site-packages/dask/utils.py in apply(func, args, kwargs)
29 def apply(func, args, kwargs=None):
30 if kwargs:
---> 31 return func(*args, **kwargs)
32 else:
33 return func(*args)
~/miniconda3/envs/flash-hdf/lib/python3.8/site-packages/dask/dataframe/core.py in _reduction_aggregate(x, aca_aggregate, **kwargs)
6289 if isinstance(x, list):
6290 x = pd.Series(x)
-> 6291 return aca_aggregate(x, **kwargs)
6292
6293
~/miniconda3/envs/flash-hdf/lib/python3.8/site-packages/dask/utils.py in __call__(self, obj, *args, **kwargs)
895
896 def __call__(self, obj, *args, **kwargs):
--> 897 return getattr(obj, self.method)(*args, **kwargs)
898
899 def __reduce__(self):
TypeError: _sum() got an unexpected keyword argument 'skipna'
If you could give me a hint about where to find the answer, it would be really helpful!
Python 3.8.6
Numpy 1.19.4
Dask 2021.01.0
Pandas 1.1.4

Pandas dataframe styling: highlight some cells based on a format column

Problem description
I have a DataFrame in which the last column is a format column. The purpose of this column is to hold the format of each DataFrame row.
Here is an example of such a dataframe:
# Example frame: the last column, 'format', holds one ';'-separated flag per
# data column ('n' = no highlight, 'y' = highlight in yellow).
df = pd.DataFrame({'ID': [1, 24, 31, 37],
                   'Status': ['to analyze', 'to analyze', 'to analyze', 'analyzed'],
                   'priority': ['P1', 'P1', 'P2', 'P1'],
                   'format': ['n;y;n', 'n;n;n', 'n;y;y', 'y;n;y']})
Each df['format'] row contains a string intended to be taken as a list (when split) to give the format of the row.
Symbols meaning:
n means "no highlight"
y means "to highlight in yellow"
df['format'].to_list()[0] = 'n;y;n' means for example:
n: first column ID item "1" not highlighted
y: second column Status item "to analyze" to be highlighted
n: third column Priority item "P1" not highlighted
So that expected outcome is:
What I've tried
I've tried to use df.format to get a list of lists containing the format needed. Here is my code:
import pandas as pd
import numpy as np
def highlight_case(df):
    """Build per-column CSS strings from the ';'-separated flags in df['format'].

    Each entry of ``df['format']`` is split on ';'. A piece containing 'y'
    becomes 'background-color: yellow'; any other piece becomes ''.
    The per-row lists are then transposed, so the returned list holds one
    inner list per flag position, each with one entry per row.

    The result is also printed before being returned.
    """
    list_of_format_lists = []
    for format_line in df['format']:
        format_line_list = format_line.split(';')
        format_list = []
        for form in format_line_list:
            if 'y' in form:
                format_list.append('background-color: yellow')
            else:
                format_list.append('')
        # One completed list of CSS strings per row.
        list_of_format_lists.append(format_list)
    list_of_format_lists = list(map(list, zip(*list_of_format_lists)))  # transpose
    print(list_of_format_lists)
    return list_of_format_lists
highlight_style = highlight_case(df)
df.style.apply(highlight_style)
It doesn't work, and I get this output:
TypeError Traceback (most recent call last)
c:\python38\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
c:\python38\lib\site-packages\pandas\io\formats\style.py in _repr_html_(self)
191 Hooks into Jupyter notebook rich display system.
192 """
--> 193 return self.render()
194
195 #doc(NDFrame.to_excel, klass="Styler")
c:\python38\lib\site-packages\pandas\io\formats\style.py in render(self, **kwargs)
538 * table_attributes
539 """
--> 540 self._compute()
541 # TODO: namespace all the pandas keys
542 d = self._translate()
c:\python38\lib\site-packages\pandas\io\formats\style.py in _compute(self)
623 r = self
624 for func, args, kwargs in self._todo:
--> 625 r = func(self)(*args, **kwargs)
626 return r
627
c:\python38\lib\site-packages\pandas\io\formats\style.py in _apply(self, func, axis, subset, **kwargs)
637 data = self.data.loc[subset]
638 if axis is not None:
--> 639 result = data.apply(func, axis=axis, result_type="expand", **kwargs)
640 result.columns = data.columns
641 else:
c:\python38\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7543 kwds=kwds,
7544 )
-> 7545 return op.get_result()
7546
7547 def applymap(self, func) -> "DataFrame":
c:\python38\lib\site-packages\pandas\core\apply.py in get_result(self)
142 # dispatch to agg
143 if is_list_like(self.f) or is_dict_like(self.f):
--> 144 return self.obj.aggregate(self.f, axis=self.axis, *self.args, **self.kwds)
145
146 # all empty
c:\python38\lib\site-packages\pandas\core\frame.py in aggregate(self, func, axis, *args, **kwargs)
7353 axis = self._get_axis_number(axis)
7354
-> 7355 relabeling, func, columns, order = reconstruct_func(func, **kwargs)
7356
7357 result = None
c:\python38\lib\site-packages\pandas\core\aggregation.py in reconstruct_func(func, **kwargs)
74
75 if not relabeling:
---> 76 if isinstance(func, list) and len(func) > len(set(func)):
77
78 # GH 28426 will raise error if duplicated function names are used and
TypeError: unhashable type: 'list'
Since the formats are encoded for each row, it makes sense to apply the styling row-wise:
def format_row(r):
    """Return one CSS string per cell of row ``r``, driven by ``r['format']``.

    ``r['format']`` is split on ';'. A flag equal to 'y' maps to
    'background-color: yellow' and anything else maps to ''. Cells that have
    no flag (the 'format' column itself, and any other trailing columns) get
    '' so the result has one entry per cell in the row.
    """
    flags = r['format'].split(';')
    styles = ['background-color: yellow' if flag == 'y' else '' for flag in flags]
    # Styler.apply(axis=1) expects a list-like as long as the row, so pad with
    # empty styles instead of assuming exactly one un-flagged trailing column.
    return styles + [''] * (len(r) - len(styles))
df.style.apply(format_row, axis=1)
Output:

Error running Theil-Sen Regression in Python

I have a dataframe similar to the following, which we'll call "df":
id value time
a 1 1
a 1.5 2
a 2 3
a 2.5 4
b 1 1
b 1.5 2
b 2 3
b 2.5 4
I am running various regressions by "id" in Python on this dataframe. Generally, this requires a grouping by "id" and then applying a function to those groupings that calculates the regression.
I am working with 2 similar regression techniques in Scipy's stats library:
Theil-Sen estimator:
(https://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.mstats.theilslopes.html)
Siegel estimator:
(https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.siegelslopes.html).
Both of these intake the same type of data. Therefore the function to calculate them should be the same aside from the actual technique used.
For Theil-Sen, I wrote the following function and the groupby statement that would be applied to that function:
# Per-group helper as posted in the question: calls stats.theilslopes and
# returns whatever it produced wrapped in a pandas Series.
# NOTE: ycol and xcol are handed to theilslopes exactly as received; the df
# argument is not used inside the body.
def theil_reg(df, xcol, ycol):
model = stats.theilslopes(ycol,xcol)
return pd.Series(model)
out = df.groupby('id').apply(theil_reg, xcol='time', ycol='value')
However, I get the following error, which I've been having the hardest time understanding how to address:
ValueError: could not convert string to float: 'time'
The actual variable time is a numpy float object, so it isn't a string. This makes me believe that the stats.theilslopes function is not recognizing that time is a column in the dataframe and is instead using 'time' as a string input into the function.
However if that's the case, then this seems to be a bug in the stats.theilslopes package, and would need to be addressed by Scipy. The reason I believe this to be the case is because the exact same function as above, but instead using the siegelslopes package, works perfectly fine and provides the output I'm expecting, and they're essentially the same estimation with the same inputs.
Doing the following on Siegel:
# Per-group helper as posted in the question: calls stats.siegelslopes and
# returns whatever it produced wrapped in a pandas Series.
# NOTE: ycol and xcol are handed to siegelslopes exactly as received; the df
# argument is not used inside the body.
def siegel_reg(df, xcol, ycol):
model = stats.siegelslopes(ycol,xcol)
return pd.Series(model)
out = df.groupby('id').apply(siegel_reg, xcol='time',ycol='value')
Does not create any errors about the time variable and conducts the regression as needed.
Does anyone have thoughts on whether I'm missing something here? If so I would appreciate any thoughts, or if not, any thoughts on how to address this with Scipy.
Edit: here is the full error message that shows up when I run this script:
ValueError Traceback (most recent call last)
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
688 try:
--> 689 result = self._python_apply_general(f)
690 except Exception:
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
706 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707 self.axis)
708
C:\Anaconda\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
189 group_axes = _get_axes(group)
--> 190 res = f(group)
191 if not _is_indexed_like(res, group_axes):
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in f(g)
678 with np.errstate(all='ignore'):
--> 679 return func(g, *args, **kwargs)
680 else:
<ipython-input-506-0a1696f0aecd> in theil_reg(df, xcol, ycol)
1 def theil_reg(df, xcol, ycol):
----> 2 model = stats.theilslopes(ycol,xcol)
3 return pd.Series(model)
C:\Anaconda\lib\site-packages\scipy\stats\_stats_mstats_common.py in
theilslopes(y, x, alpha)
221 else:
--> 222 x = np.array(x, dtype=float).flatten()
223 if len(x) != len(y):
ValueError: could not convert string to float: 'time'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-507-9a199e0ce924> in <module>
----> 1 df_accel_correct.groupby('chart').apply(theil_reg, xcol='time',
ycol='value')
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
699
700 with _group_selection_context(self):
--> 701 return self._python_apply_general(f)
702
703 return result
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
705 def _python_apply_general(self, f):
706 keys, values, mutated = self.grouper.apply(f,
self._selected_obj,
--> 707 self.axis)
708
709 return self._wrap_applied_output(
C:\Anaconda\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
188 # group might be modified
189 group_axes = _get_axes(group)
--> 190 res = f(group)
191 if not _is_indexed_like(res, group_axes):
192 mutated = True
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in f(g)
677 def f(g):
678 with np.errstate(all='ignore'):
--> 679 return func(g, *args, **kwargs)
680 else:
681 raise ValueError('func must be a callable if args or '
<ipython-input-506-0a1696f0aecd> in theil_reg(df, xcol, ycol)
1 def theil_reg(df, xcol, ycol):
----> 2 model = stats.theilslopes(ycol,xcol)
3 return pd.Series(model)
C:\Anaconda\lib\site-packages\scipy\stats\_stats_mstats_common.py in theilslopes(y, x, alpha)
220 x = np.arange(len(y), dtype=float)
221 else:
--> 222 x = np.array(x, dtype=float).flatten()
223 if len(x) != len(y):
224 raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
ValueError: could not convert string to float: 'time'
Update 2: after calling df in the function, I received the following error message:
ValueError Traceback (most recent call last)
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
688 try:
--> 689 result = self._python_apply_general(f)
690 except Exception:
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
706 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707 self.axis)
708
C:\Anaconda\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
189 group_axes = _get_axes(group)
--> 190 res = f(group)
191 if not _is_indexed_like(res, group_axes):
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in f(g)
678 with np.errstate(all='ignore'):
--> 679 return func(g, *args, **kwargs)
680 else:
<ipython-input-563-5db69048f347> in theil_reg(df, xcol, ycol)
1 def theil_reg(df, xcol, ycol):
----> 2 model = stats.theilslopes(df[ycol],df[xcol])
3 return pd.Series(model)
C:\Anaconda\lib\site-packages\scipy\stats\_stats_mstats_common.py in theilslopes(y, x, alpha)
248 sigma = np.sqrt(sigsq)
--> 249 Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
250 Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
ValueError: cannot convert float NaN to integer
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-564-d7794bd1d495> in <module>
----> 1 correct_theil = df_accel_correct.groupby('chart').apply(theil_reg, xcol='time', ycol='value')
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
699
700 with _group_selection_context(self):
--> 701 return self._python_apply_general(f)
702
703 return result
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
705 def _python_apply_general(self, f):
706 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707 self.axis)
708
709 return self._wrap_applied_output(
C:\Anaconda\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
188 # group might be modified
189 group_axes = _get_axes(group)
--> 190 res = f(group)
191 if not _is_indexed_like(res, group_axes):
192 mutated = True
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in f(g)
677 def f(g):
678 with np.errstate(all='ignore'):
--> 679 return func(g, *args, **kwargs)
680 else:
681 raise ValueError('func must be a callable if args or '
<ipython-input-563-5db69048f347> in theil_reg(df, xcol, ycol)
1 def theil_reg(df, xcol, ycol):
----> 2 model = stats.theilslopes(df[ycol],df[xcol])
3 return pd.Series(model)
C:\Anaconda\lib\site-packages\scipy\stats\_stats_mstats_common.py in theilslopes(y, x, alpha)
247 # Find the confidence interval indices in `slopes`
248 sigma = np.sqrt(sigsq)
--> 249 Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
250 Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
251 delta = slopes[[Rl, Ru]]
ValueError: cannot convert float NaN to integer
However, I have no null values in either column, and both columns are floats. Any suggestions on this error?
Essentially, you are passing the string values of column names (not any value entities) into methods but the slopes calls require numpy arrays (or pandas series that can be coerced into arrays). Specifically, you are attempting this call with no reference to df and hence your error:
model = stats.theilslopes('value', 'time')
Simply reference df in the calls:
model = stats.theilslopes(df['value'], df['time'])
model = stats.theilslopes(df[ycol], df[xcol])
Different results across packages do not mean there is a bug in SciPy. Packages run different implementations. Read the docs carefully to see how to call each method. Possibly, the other package you refer to accepts a data argument in the call, with the named strings referencing its columns, like below:
slopes_call(y='y_string', x='x_string', data=df)
In general, the Python object model always requires explicit named references to calls and objects and does not assume context.

Trying to apply fftconvolve over xarray

I have a 3D xarray DataArray and a 1D xarray DataArray. One dimension of the 3D array has the same size as the single dimension of the other. I want to apply scipy.signal.fftconvolve along these corresponding dimensions. But I get an error. Here is some minimal example code:
import numpy as np
from scipy.signal import fftconvolve
import xarray as xr
xarr1 = xr.DataArray(np.random.random([10,20,500]),
dims=('dim1', 'dim2', 'sample'))
xarr2 = xr.DataArray(np.random.random(500),
dims=('sample',))
res = xr.apply_ufunc(fftconvolve, xarr1, xarr2,
input_core_dims=[['sample'], ['sample']],
kwargs={'mode': 'same'},
vectorize=True)
What I want to do is the equivalent of this:
# Reference implementation: convolve each 1-D slice along the last axis of
# xarr1 with xarr2, one (i1, i2) position at a time.
res = np.zeros(xarr1.shape)
for i1 in range(xarr1.shape[0]):
    for i2 in range(xarr1.shape[1]):
        res[i1, i2, :] = fftconvolve(xarr1[i1, i2, :], xarr2, mode='same')
But I get the following error with the apply_ufunc version:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-174-4f45f68ab252> in <module>()
7 xarr2 = xr.DataArray(np.random.random(500),
8 dims=('sample',))
----> 9 res = xr.apply_ufunc(fftconvolve, xarr1, xarr2, input_core_dims=[['sample'], ['sample']], kwargs={'mode': 'same'}, vectorize=True)
~\Anaconda3\envs\dataproc\lib\site-packages\xarray\core\computation.py in apply_ufunc(func, *args, **kwargs)
932 join=join,
933 exclude_dims=exclude_dims,
--> 934 keep_attrs=keep_attrs)
935 elif any(isinstance(a, Variable) for a in args):
936 return variables_ufunc(*args)
~\Anaconda3\envs\dataproc\lib\site-packages\xarray\core\computation.py in apply_dataarray_ufunc(func, *args, **kwargs)
209
210 data_vars = [getattr(a, 'variable', a) for a in args]
--> 211 result_var = func(*data_vars)
212
213 if signature.num_outputs > 1:
~\Anaconda3\envs\dataproc\lib\site-packages\xarray\core\computation.py in apply_variable_ufunc(func, *args, **kwargs)
563 raise ValueError('unknown setting for dask array handling in '
564 'apply_ufunc: {}'.format(dask))
--> 565 result_data = func(*input_data)
566
567 if signature.num_outputs > 1:
~\Anaconda3\envs\dataproc\lib\site-packages\numpy\lib\function_base.py in __call__(self, *args, **kwargs)
2737 vargs.extend([kwargs[_n] for _n in names])
2738
-> 2739 return self._vectorize_call(func=func, args=vargs)
2740
2741 def _get_ufunc_and_otypes(self, func, args):
~\Anaconda3\envs\dataproc\lib\site-packages\numpy\lib\function_base.py in _vectorize_call(self, func, args)
2803 """Vectorized call to `func` over positional `args`."""
2804 if self.signature is not None:
-> 2805 res = self._vectorize_call_with_signature(func, args)
2806 elif not args:
2807 res = func()
~\Anaconda3\envs\dataproc\lib\site-packages\numpy\lib\function_base.py in _vectorize_call_with_signature(self, func, args)
2867
2868 for output, result in zip(outputs, results):
-> 2869 output[index] = result
2870
2871 if outputs is None:
ValueError: setting an array element with a sequence.
Specifying output_core_dims solves this,
res = xr.apply_ufunc(fftconvolve, xarr1, xarr2,
input_core_dims=[['sample'], ['sample']],
output_core_dims=[['sample']],
kwargs={'mode': 'same'}, vectorize=True)

Categories