Groupby on non-unique axis with Pandas Panel - python

I have a pandas Panel with a non-unique major_axis and I am trying to sum the non-unique rows using groupby, but I get an error saying that the major_axis is not iterable. I have searched Stack Overflow and the message board, but it seems the Panel is not as widely used as the DataFrame.
Here is an example that produces the error:
import pandas as pd
import datetime as dt
import dateutil.relativedelta as rd
import numpy as np
items = ['A','B']
minor_axis = ['x','y']
diff = rd.relativedelta(years=1)
major_axis = [dt.date(2013,1,1) + (diff * shift) for shift in xrange(4)] * 2
values = np.random.randn(2,8,2)
data = pd.Panel(data=values, major_axis=major_axis, minor_axis=minor_axis, items=items)
data.groupby(sum, axis='major')
and here is the stacktrace:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-29-e30fb9b32fce> in <module>()
----> 1 data.groupby(sum, axis='major')
/home/brendan/python_dev/venv/local/lib/python2.7/site-packages/pandas/core/panel.pyc in groupby(self, function, axis)
1084 from pandas.core.groupby import PanelGroupBy
1085 axis = self._get_axis_number(axis)
-> 1086 return PanelGroupBy(self, function, axis=axis)
1087
1088 def swapaxes(self, axis1='major', axis2='minor', copy=True):
/home/brendan/python_dev/venv/local/lib/python2.7/site-packages/pandas/core/groupby.pyc in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze)
195 if grouper is None:
196 grouper, exclusions = _get_grouper(obj, keys, axis=axis,
--> 197 level=level, sort=sort)
198
199 self.grouper = grouper
/home/brendan/python_dev/venv/local/lib/python2.7/site-packages/pandas/core/groupby.pyc in _get_grouper(obj, key, axis, level, sort)
1323 raise AssertionError(errmsg)
1324
-> 1325 ping = Grouping(group_axis, gpr, name=name, level=level, sort=sort)
1326 groupings.append(ping)
1327
/home/brendan/python_dev/venv/local/lib/python2.7/site-packages/pandas/core/groupby.pyc in __init__(self, index, grouper, name, level, sort)
1197 # no level passed
1198 if not isinstance(self.grouper, np.ndarray):
-> 1199 self.grouper = self.index.map(self.grouper)
1200 if not (hasattr(self.grouper,"__len__") and \
1201 len(self.grouper) == len(self.index)):
/home/brendan/python_dev/venv/local/lib/python2.7/site-packages/pandas/core/index.pyc in map(self, mapper)
856
857 def map(self, mapper):
--> 858 return self._arrmap(self.values, mapper)
859
860 def isin(self, values):
/home/brendan/python_dev/venv/local/lib/python2.7/site-packages/pandas/algos.so in pandas.algos.arrmap_object (pandas/algos.c:62269)()
TypeError: 'datetime.date' object is not iterable
Any ideas about how to handle this situation?
Many thanks,
Brendan

In pandas 0.12 you can try:
>>> data.groupby(np.sum, axis='major')
<pandas.core.groupby.PanelGroupBy object at 0x1a2ba50>

The answer of @alko is indeed the solution to your question, although I think you misunderstand the groupby: you still need to apply a function or aggregation to the groupby() call, in your case data.groupby(..).sum() to sum all items in a group.
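For example, a minimal sketch against the 0.12-era Panel API (np.sum applied to a single axis label just returns the label, so the duplicate dates land in the same group; untested on the long-deprecated Panel):
>>> # sum each group of rows that share a major_axis label
>>> data.groupby(np.sum, axis='major').sum()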
But I would recommend considering whether you need a Panel at all. Of course I don't know your case, but in many cases a DataFrame with a MultiIndex can solve the problem.
The equivalent of your panel and the groupby would look like the following:
>>> items = ['A', 'A', 'B', 'B']
>>> minor_axis = ['x','y', 'x', 'y']
>>> diff = rd.relativedelta(years=1)
>>> major_axis = [dt.date(2013,1,1) + (diff * shift) for shift in xrange(4)] * 2
>>> values = np.random.randn(8,4)
>>>
>>> data = pd.DataFrame(values, index=major_axis, columns=pd.MultiIndex.from_arrays([items, minor_axis]))
>>> data
A B
x y x y
2013-01-01 -1.063086 0.564123 0.128006 -0.658767
2014-01-01 2.182473 -0.851618 1.180264 0.165581
2015-01-01 -0.003941 0.590801 -1.616197 -2.270557
2016-01-01 -0.736524 0.172791 1.220589 -1.303294
2013-01-01 -1.052184 -1.171545 -0.473488 -0.140327
2014-01-01 0.021189 0.827241 0.775863 -0.882874
2015-01-01 -1.762289 0.705692 0.593365 -0.984109
2016-01-01 -1.946106 -1.108336 -1.691758 -0.088932
>>> data.groupby(data.index).sum()
A B
x y x y
2013-01-01 -2.115270 -0.607422 -0.345482 -0.799094
2014-01-01 2.203662 -0.024377 1.956127 -0.717293
2015-01-01 -1.766230 1.296492 -1.022832 -3.254667
2016-01-01 -2.682630 -0.935544 -0.471170 -1.392226
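Grouping on the index level directly is an equivalent spelling here and avoids passing the index as an explicit key:
>>> # level=0 refers to the date level of the index
>>> data.groupby(level=0).sum()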

Related

DataError: No numeric types to aggregate pandas pivot

I have a pandas dataframe like this:
User-Id Training-Id TrainingTaken
0 4327024 25 10
1 6662572 3 10
2 3757520 26 10
and I need to convert it to a Matrix like they do here:
https://github.com/tr1ten/Anime-Recommender-System/blob/main/HybridRecommenderSystem.ipynb
Cell 13.
So I did the following:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling
from scipy.sparse import csr_matrix
from lightfm.evaluation import auc_score
from lightfm.data import Dataset
user_training_interaction = pd.pivot_table(trainingtaken, index='User-Id', columns='Training-Id', values='TrainingTaken')
user_training_interaction.fillna(0,inplace=True)
user_training_csr = csr_matrix(user_training_interaction.values)
But I get this error:
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-96-5a2c7ba28976> in <module>
10 from lightfm.data import Dataset
11
---> 12 user_training_interaction = pd.pivot_table(trainingtaken, index='User-Id', columns='Training-Id', values='TrainingTaken')
13 user_training_interaction.fillna(0,inplace=True)
14 user_training_csr = csr_matrix(user_training_interaction.values)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
110
111 grouped = data.groupby(keys, observed=observed)
--> 112 agged = grouped.agg(aggfunc)
113 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
114 agged = agged.dropna(how="all")
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
949 func = maybe_mangle_lambdas(func)
950
--> 951 result, how = self._aggregate(func, *args, **kwargs)
952 if how is None:
953 return result
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
305
306 if isinstance(arg, str):
--> 307 return self._try_aggregate_string_function(arg, *args, **kwargs), None
308
309 if isinstance(arg, dict):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/base.py in _try_aggregate_string_function(self, arg, *args, **kwargs)
261 if f is not None:
262 if callable(f):
--> 263 return f(*args, **kwargs)
264
265 # people may try to aggregate on a non-callable attribute
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in mean(self, numeric_only)
1396 "mean",
1397 alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only),
-> 1398 numeric_only=numeric_only,
1399 )
1400
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
1020 ) -> DataFrame:
1021 agg_blocks, agg_items = self._cython_agg_blocks(
-> 1022 how, alt=alt, numeric_only=numeric_only, min_count=min_count
1023 )
1024 return self._wrap_agged_blocks(agg_blocks, items=agg_items)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
1128
1129 if not (agg_blocks or split_frames):
-> 1130 raise DataError("No numeric types to aggregate")
1131
1132 if split_items:
DataError: No numeric types to aggregate
What am I missing?
The pandas documentation states:
While pivot() provides general purpose pivoting with various data types (strings, numerics, etc.), pandas also provides pivot_table() for pivoting with aggregation of numeric data.
Make sure the column is numeric. Without seeing how you create trainingtaken I can't provide more specific guidance, but the following may help:
Make sure you handle "empty" values in that column. The pandas guide on missing data is a very good place to start; it points out that "a column of integers with even one missing value is cast to floating-point dtype".
If working with a DataFrame, the column can be cast to a specific type via your_df.your_col.astype(int), or for your example, trainingtaken['TrainingTaken'].astype(int).
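Putting this together, a minimal sketch; the trainingtaken frame below is a hypothetical reconstruction of the sample shown, assuming TrainingTaken arrived as strings (one common way to end up with an all-object frame):
import pandas as pd
from scipy.sparse import csr_matrix

# Hypothetical reconstruction of the question's data; TrainingTaken as
# strings (object dtype) reproduces the DataError on pivot_table.
trainingtaken = pd.DataFrame({
    "User-Id": [4327024, 6662572, 3757520],
    "Training-Id": [25, 3, 26],
    "TrainingTaken": ["10", "10", "10"],
})

# Cast the value column to a numeric dtype before pivoting.
trainingtaken["TrainingTaken"] = trainingtaken["TrainingTaken"].astype(int)

user_training_interaction = pd.pivot_table(
    trainingtaken, index="User-Id", columns="Training-Id", values="TrainingTaken"
)
user_training_interaction.fillna(0, inplace=True)
user_training_csr = csr_matrix(user_training_interaction.values)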

How can I iterate through elements of a koala groupby?

I would like to iterate through groups in a dataframe. This is possible in pandas, but when I port this to koalas, I get an error.
import databricks.koalas as ks
import pandas as pd
pdf = pd.DataFrame({'x':range(3), 'y':['a','b','b'], 'z':['a','b','b']})
# Create a Koalas DataFrame from pandas DataFrame
df = ks.from_pandas(pdf)
for a in df.groupby('x'):
    print(a)
Here is the error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-35-d4164d1f71e0> in <module>
----> 1 for a in df.groupby('x'):
2 print(a)
/opt/conda/lib/python3.7/site-packages/databricks/koalas/groupby.py in __getitem__(self, item)
2630 if self._as_index and is_name_like_value(item):
2631 return SeriesGroupBy(
-> 2632 self._kdf._kser_for(item if is_name_like_tuple(item) else (item,)),
2633 self._groupkeys,
2634 dropna=self._dropna,
/opt/conda/lib/python3.7/site-packages/databricks/koalas/frame.py in _kser_for(self, label)
721 Name: id, dtype: int64
722 """
--> 723 return self._ksers[label]
724
725 def _apply_series_op(self, op, should_resolve: bool = False):
KeyError: (0,)
Is this kind of group iteration possible in koalas? The koalas documentation kind of implies it is possible - https://koalas.readthedocs.io/en/latest/reference/groupby.html
Groupby iteration is not yet implemented:
https://github.com/databricks/koalas/issues/2014
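Until that lands, a possible workaround is to collect the distinct keys and filter per key (a sketch; it assumes the number of groups is small, since each iteration triggers a separate Spark job):
# Collect the distinct grouping keys to the driver, then filter per key.
for key in df['x'].unique().to_numpy():
    group = df[df['x'] == key]
    print(key)
    print(group.to_pandas())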

IndexSlice on a datetime MultiIndex not working, but doesn't seem different from a properly-working toy equivalent

I'm used to using IndexSlice on datetime indices. This is a toy equivalent of my MultiIndex DataFrame, and you can see the slicing works:
import pandas as pd
idx = pd.IndexSlice

# slicing works on a simple datetime index
qf = pd.DataFrame(index=pd.date_range(start="1Jan2019",freq="d",periods=30))
qf.loc[idx['2019-1-15':None]] # works
# the same slicing works on a multiindex
qf.reset_index(inplace=True)
qf['foo']="bar"
qf['other']=range(len(qf))
qf['filler']="egbdf"
qf.set_index(['index','foo', 'other'], inplace=True)
qf.loc[idx['2019-1-15':'2019-1-20',:,:],:] # works
qf.loc[idx['2019-1-15':None,'bar',:],:] # works
But something is going on with my real DataFrame. I cannot see what the difference is.
xf.loc[idx['2019-5-1':'2019-6-1',"squat",:],:] # This works ok
xf.loc[idx['2019-5-1':None,"squat",:],:] # This fails
The error I get when I slice with '2019-5-1':None is:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-280-b0dce8e9e337> in <module>
1 xf.loc[idx['2019-5-1':'2019-6-1',"squat",:],:] # This works ok
----> 2 xf.loc[idx['2019-5-1':None,"squat",:],:] # This fails
3 #xf
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1492 except (KeyError, IndexError, AttributeError):
1493 pass
-> 1494 return self._getitem_tuple(key)
1495 else:
1496 # we by definition only have the 0th axis
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
866 def _getitem_tuple(self, tup):
867 try:
--> 868 return self._getitem_lowerdim(tup)
869 except IndexingError:
870 pass
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
967 # we may have a nested tuples indexer here
968 if self._is_nested_tuple_indexer(tup):
--> 969 return self._getitem_nested_tuple(tup)
970
971 # we maybe be using a tuple to represent multiple dimensions here
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexing.py in _getitem_nested_tuple(self, tup)
1046
1047 current_ndim = obj.ndim
-> 1048 obj = getattr(obj, self.name)._getitem_axis(key, axis=axis)
1049 axis += 1
1050
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1904 # nested tuple slicing
1905 if is_nested_tuple(key, labels):
-> 1906 locs = labels.get_locs(key)
1907 indexer = [slice(None)] * self.ndim
1908 indexer[axis] = locs
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexes\multi.py in get_locs(self, seq)
2774 # a slice, include BOTH of the labels
2775 indexer = _update_indexer(_convert_to_indexer(
-> 2776 self._get_level_indexer(k, level=i, indexer=indexer)),
2777 indexer=indexer)
2778 else:
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexes\multi.py in _get_level_indexer(self, key, level, indexer)
2635 # note that the stop ALREADY includes the stopped point (if
2636 # it was a string sliced)
-> 2637 return convert_indexer(start.start, stop.stop, step)
2638
2639 elif level > 0 or self.lexsort_depth == 0 or step is not None:
AttributeError: 'int' object has no attribute 'stop'
I cannot see any material difference between the toy index and the real index, and I cannot see how the error message relates to passing None into the slicer.
========================================================
I figured out why it works in some examples and fails in others.
The code works when the index holds plain dates, but fails when the index holds datetimes.
#this index is solely dates, not datetimes, and everything works
dt_index = pd.date_range(start="1jan2019",periods=100,freq="d")
zf = pd.DataFrame(index=dt_index)
zf['foo']=10
zf['bar']="squat"
zf['zaa']=range(len(dt_index))
zf.index.name="date"
zf = zf.reset_index().set_index(["date", "bar", "zaa"])
zf.loc[idx['2019-1-1':'2019-1-3',"squat",:],:] # This works ok
zf.loc[idx['2019-1-1':,"squat",:],:] # This works
zf.loc[idx['2019-1-1':None,'squat',:,:],:] # This works
The failing example:
dt_index = pd.date_range(start="1jan2019 00:15:33",periods=100,freq="h")
zf = pd.DataFrame(index=dt_index)
zf['foo']=10
zf['bar']="squat"
zf['zaa']=range(len(dt_index))
zf.index.name="date"
zf = zf.reset_index().set_index(["date", "bar", "zaa"])
zf.loc[idx['2019-1-1':'2019-1-3',"squat",:],:] # This works ok
#zf.loc[idx['2019-1-1':,"squat",:],:] # This fails AttributeError: 'int' object has no attribute 'stop'
#zf.loc[idx['2019-1-1':None,'squat',:,:],:] # AttributeError: 'int' object has no attribute 'stop'
This seems like a bug. Following this discussion, check lines 2614-2637 of multi.py in the pandas package:
try:
    if key.start is not None:
        start = level_index.get_loc(key.start)
    else:
        start = 0
    if key.stop is not None:
        stop = level_index.get_loc(key.stop)
    else:
        stop = len(level_index) - 1
    step = key.step
except KeyError:
    # we have a partial slice (like looking up a partial date
    # string)
    start = stop = level_index.slice_indexer(key.start, key.stop,
                                             key.step, kind='loc')
    step = start.step

if isinstance(start, slice) or isinstance(stop, slice):
    # we have a slice for start and/or stop
    # a partial date slicer on a DatetimeIndex generates a slice
    # note that the stop ALREADY includes the stopped point (if
    # it was a string sliced)
    return convert_indexer(start.start, stop.stop, step)
The stop would always be an int, because the endpoint is None. But the start is different for qf and xf: the datetime index of qf has a resolution of one day, so qf.index.levels[0].get_loc('2019-01-17') is an int, whereas xf has a resolution of 0.001s, so xf.index.levels[0].get_loc('2019-01-17') is a slice. That sends the code down the convert_indexer path, which calls stop.stop while stop is still an int.
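You can check this asymmetry directly on the toy frames above, using a date that falls inside both ranges (zf here is the hourly frame from the failing example):
qf.index.levels[0].get_loc('2019-01-02')  # int: the string matches the daily resolution exactly
zf.index.levels[0].get_loc('2019-01-02')  # slice: partial-string match on a sub-daily index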
As a workaround, you can use a very large date instead of None:
xf.loc[idx['2019-5-1':'2222',"squat",:],:]
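If you prefer not to hard-code a far-off year, pd.Timestamp.max may also work as the stand-in upper bound (untested beyond the toy examples above):
# pd.Timestamp.max stands in for the open endpoint
xf.loc[idx['2019-5-1':pd.Timestamp.max,"squat",:],:]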

join two tuples with different index and different shape

I have two tuples which I would like to combine into one, but I always get a lot of NaNs when I try to concat, merge or join them.
My tuple row_ges has shape (5 rows, 6 columns), and I would like to have it together with my tuple row in one DataFrame of shape (5 rows, 301 columns).
I tried:
result=row_ges+row
result_1=row_ges.append(row,ignore_index=True)
result_2 = pd.concat([row_ges,row], axis=1, ignore_index=True)
but I always get a shape like (10 rows, 301 columns), and for result_2 I get an error I don't understand:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-209-cdc656a8e828> in <module>()
4 result=row_ges+row
5 result_1=row_ges.append(row,ignore_index=True)
----> 6 result_2 = pd.concat([row_ges,row], axis=1, ignore_index=True)
7
8 #gesamt = pd.DataFrame()
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, sort, copy)
224 verify_integrity=verify_integrity,
225 copy=copy, sort=sort)
--> 226 return op.get_result()
227
228
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/concat.py in get_result(self)
421 new_data = concatenate_block_managers(
422 mgrs_indexers, self.new_axes, concat_axis=self.axis,
--> 423 copy=self.copy)
424 if not self.copy:
425 new_data._consolidate_inplace()
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy)
5423 blocks.append(b)
5424
-> 5425 return BlockManager(blocks, axes)
5426
5427
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check)
3280
3281 if do_integrity_check:
-> 3282 self._verify_integrity()
3283
3284 self._consolidate_check()
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in _verify_integrity(self)
3491 for block in self.blocks:
3492 if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
-> 3493 construction_error(tot_items, block.shape[1:], self.axes)
3494 if len(self.items) != tot_items:
3495 raise AssertionError('Number of manager items must equal union of '
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
4841 raise ValueError("Empty data passed with indices specified.")
4842 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 4843 passed, implied))
4844
4845
ValueError: Shape of passed values is (301, 29), indices imply (301, 9)
The problem was the different indexes. With this code it is working:
row.reset_index(drop=True, inplace=True)
row_ges.reset_index(drop=True, inplace=True)
result = pd.concat([row_ges,row], axis=1)
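The doubling to 10 rows happens because concat along axis=1 aligns rows by index, and two disjoint indexes align to their union, padded with NaNs. A minimal sketch of the effect:
import pandas as pd

a = pd.DataFrame({'x': [1, 2]}, index=[0, 1])
b = pd.DataFrame({'y': [3, 4]}, index=[2, 3])
pd.concat([a, b], axis=1).shape  # (4, 2): union of the two indexes, filled with NaN
b = b.reset_index(drop=True)     # now both frames share the index [0, 1]
pd.concat([a, b], axis=1).shape  # (2, 2): rows aligned one-to-one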

How to resample a python pandas TimeSeries containing dytpe Decimal values?

I have a pandas Series filled with decimal numbers of type Decimal. I'd like to use the new pandas 0.8 functionality to resample the decimal time series like this:
resampled = ts.resample('D', how = 'mean')
When trying this I get a "GroupByError: No numeric types to aggregate" error. I assume the problem is that np.mean is used internally to resample the values, and np.mean expects floats instead of Decimals.
Thanks to the help of this forum I managed to solve a similar question using groupby and the apply function, but I would love to also use the cool resample function.
How can I use the mean method on a pandas TimeSeries with Decimal-type values? Any idea how to solve this?
Here is the complete ipython session creating the error:
In [37]: from decimal import Decimal
In [38]: from pandas import *
In [39]: rng = date_range('1.1.2012',periods=48, freq='H')
In [40]: rnd = np.random.randn(len(rng))
In [41]: rnd_dec = [Decimal(x) for x in rnd]
In [42]: ts = Series(rnd_dec, index=rng)
In [43]: ts[0:3]
Out[43]:
2012-01-01 00:00:00 -0.1020591335576267189022559023214853368699550628
2012-01-01 01:00:00 0.99245713975437366283216533702216111123561859130
2012-01-01 02:00:00 1.80080710727195758558139004890108481049537658691
Freq: H
In [44]: type(ts[0])
Out[44]: decimal.Decimal
In [45]: ts.resample('D', how = 'mean')
---------------------------------------------------------------------------
GroupByError Traceback (most recent call last)
C:\Users\THM\Documents\Python\<ipython-input-45-09c898403ddd> in <module>()
----> 1 ts.resample('D', how = 'mean')
C:\Python27\lib\site-packages\pandas\core\generic.pyc in resample(self, rule, how, axis, fill_method, closed, label, convention, kind, loffset, l
imit, base)
187 fill_method=fill_method, convention=convention,
188 limit=limit, base=base)
--> 189 return sampler.resample(self)
190
191 def first(self, offset):
C:\Python27\lib\site-packages\pandas\tseries\resample.pyc in resample(self, obj)
65
66 if isinstance(axis, DatetimeIndex):
---> 67 rs = self._resample_timestamps(obj)
68 elif isinstance(axis, PeriodIndex):
69 offset = to_offset(self.freq)
C:\Python27\lib\site-packages\pandas\tseries\resample.pyc in _resample_timestamps(self, obj)
184 if len(grouper.binlabels) < len(axlabels) or self.how is not None:
185 grouped = obj.groupby(grouper, axis=self.axis)
--> 186 result = grouped.aggregate(self._agg_method)
187 else:
188 # upsampling shortcut
C:\Python27\lib\site-packages\pandas\core\groupby.pyc in aggregate(self, func_or_funcs, *args, **kwargs)
1215 """
1216 if isinstance(func_or_funcs, basestring):
-> 1217 return getattr(self, func_or_funcs)(*args, **kwargs)
1218
1219 if hasattr(func_or_funcs,'__iter__'):
C:\Python27\lib\site-packages\pandas\core\groupby.pyc in mean(self)
290 """
291 try:
--> 292 return self._cython_agg_general('mean')
293 except GroupByError:
294 raise
C:\Python27\lib\site-packages\pandas\core\groupby.pyc in _cython_agg_general(self, how)
376
377 if len(output) == 0:
--> 378 raise GroupByError('No numeric types to aggregate')
379
380 return self._wrap_aggregated_output(output, names)
GroupByError: No numeric types to aggregate
Any help is appreciated.
Thanks,
Thomas
I found the answer myself. It is possible to provide a function to the 'how' argument of resample:
f = lambda x: Decimal(np.mean(x))
ts.resample('D', how = f)
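Under the modern resample API, where the how= argument has been removed, the same idea would be spelled with apply, computing the mean in Decimal arithmetic directly (a sketch, not verified across versions):
# mean of each daily bucket, done entirely in Decimal arithmetic
resampled = ts.resample('D').apply(lambda x: sum(x, Decimal(0)) / len(x))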
I get this error for object-dtype columns in a DataFrame. I got around it by using
df.resample('D', fill_method='ffill', how=lambda c: c[-1])
