Transform pandas series of serialized json into pandas series of dicts - python

I have the following pandas series:
>>> df.A.head()
0    {"Date_": "2022-06-01T01:00:00+05:30", "submit...
1    {"Growth": [{"textField": "", "Change_Size": "...
2    {"submit": true, "HSI_Tag": "xyz...
3    {"submit": true, "HSI_Tag": "xyz...
4    {"submit": true, "roleList": "xy...
Name: A, dtype: object
Every item in the series is a serialized JSON string. I would like to turn every item into a dictionary. I am trying the following, but I get an error:
for i in range(len(df.A)):
    df.A.iloc[i] = json.loads(df.A.iloc[i])
The error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-27-9b4e8d4e6d76> in <module>
1 for i in range(len(df.A)):
----> 2 df.A.iloc[i] = json.loads(df.A.iloc[i])
C:\ANACONDA3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
188 key = com.apply_if_callable(key, self.obj)
189 indexer = self._get_setitem_indexer(key)
--> 190 self._setitem_with_indexer(indexer, value)
191
192 def _validate_key(self, key, axis):
C:\ANACONDA3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
640 # setting for extensionarrays that store dicts. Need to decide
641 # if it's worth supporting that.
--> 642 value = self._align_series(indexer, Series(value))
643
644 elif isinstance(value, ABCDataFrame):
C:\ANACONDA3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
774
775 elif is_scalar(indexer):
--> 776 ax = self.obj._get_axis(1)
777
778 if ser.index.equals(ax):
C:\ANACONDA3\lib\site-packages\pandas\core\generic.py in _get_axis(self, axis)
376
377 def _get_axis(self, axis):
--> 378 name = self._get_axis_name(axis)
379 return getattr(self, name)
380
C:\ANACONDA3\lib\site-packages\pandas\core\generic.py in _get_axis_name(cls, axis)
373 pass
374 raise ValueError('No axis named {0} for object type {1}'
--> 375 .format(axis, type(cls)))
376
377 def _get_axis(self, axis):
ValueError: No axis named 1 for object type <class 'type'>
How can I fix it?

I managed to do it eventually with apply and a lambda like this:
df.A = df.A.apply(lambda x: json.loads(x))
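Since json.loads already takes a single string argument, the lambda can be dropped. A minimal, self-contained sketch of the same idea (the example data here is hypothetical):
import json
import pandas as pd

# Hypothetical data: each cell holds a serialized JSON object
df = pd.DataFrame({'A': ['{"submit": true, "HSI_Tag": "xyz"}',
                         '{"Date_": "2022-06-01T01:00:00+05:30", "submit": false}']})

# Element-wise assignment via .iloc tries to align the dict like a Series
# (the ValueError above); apply builds a new Series of dicts instead.
df['A'] = df['A'].apply(json.loads)
print(type(df['A'].iloc[0]))   # <class 'dict'>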

Related

Numpy - Length of values (1191) does not match length of index (1250)

I would appreciate any help with this. I'm getting an error of
ValueError: Length of values (1191) does not match length of index (1250).
I don't understand where NumPy is getting the length of 1191 from. I've created a DataFrame of 1250 rows, and I'm trying to assign future['floor'] to it based on conditions. future['cap'] works fine, but that is plain pandas, whereas 'floor' uses np.select, and I don't understand why NumPy would cause this error. Thanks for your help. Gav
future = m.make_future_dataframe(periods=1250, freq='D', include_history=False)
conditions = [
    g['Operator'] == 100151,
    g['Operator'] == 20137,
    g['Operator'] == 20147,
]
values = [
    g['y'].mean()/2,
    g['y'].mean()/2,
    g['y'].mean()/2
]
future['floor'] = np.select(conditions, values)
future['cap'] = max(g['y'])*1.25
forecast = m.predict(future)
ValueError Traceback (most recent call last)
<ipython-input-184-a698f789f6b3> in <module>
----> 1 fout = df.groupby('Operator').apply(forecast_data)
~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
892 with option_context("mode.chained_assignment", None):
893 try:
--> 894 result = self._python_apply_general(f, self._selected_obj)
895 except TypeError:
896 # gh-20949
~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f, data)
926 data after applying f
927 """
--> 928 keys, values, mutated = self.grouper.apply(f, data, self.axis)
929
930 return self._wrap_applied_output(
~\Anaconda3\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
236 # group might be modified
237 group_axes = group.axes
--> 238 res = f(group)
239 if not _is_indexed_like(res, group_axes, axis):
240 mutated = True
<ipython-input-183-f88148e0e94e> in forecast_data(g)
42 g['y'].mean()/2
43 ]
---> 44 future['floor'] = np.select(conditions,values)
45 future['cap'] = max(g['y'])*1.25
46 forecast = m.predict(future)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3161 else:
3162 # set column
-> 3163 self._set_item(key, value)
3164
3165 def _setitem_slice(self, key: slice, value):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3240 """
3241 self._ensure_valid_index(value)
-> 3242 value = self._sanitize_column(key, value)
3243 NDFrame._set_item(self, key, value)
3244
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
3897
3898 # turn me into an ndarray
-> 3899 value = sanitize_index(value, self.index)
3900 if not isinstance(value, (np.ndarray, Index)):
3901 if isinstance(value, list) and len(value) > 0:
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in sanitize_index(data, index)
749 """
750 if len(data) != len(index):
--> 751 raise ValueError(
752 "Length of values "
753 f"({len(data)}) "
ValueError: Length of values (1191) does not match length of index (1250)
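For what it's worth, the length mismatch can be reproduced outside Prophet: np.select returns an array whose length matches the condition arrays (built from the group g inside forecast_data), not the length of future. A minimal sketch under that assumption, with g and future as stand-in frames:
import numpy as np
import pandas as pd

g = pd.DataFrame({'Operator': [100151] * 1191, 'y': np.arange(1191)})     # group with 1191 rows
future = pd.DataFrame({'ds': pd.date_range('2022-01-01', periods=1250)})  # 1250 future rows

conditions = [g['Operator'] == 100151]
values = [g['y'].mean() / 2]

floor = np.select(conditions, values)
print(len(floor))            # 1191 -- matches g, not future
# future['floor'] = floor    # would raise: Length of values (1191) does not match length of index (1250)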

Is there a way I can read the JSON file saved by dataframe.to_json(orient='table')?

I updated to pandas 0.20.1 recently and I tried to use the new feature of to_json(orient='table')
import pandas as pd
pd.__version__
# '0.20.1'
a = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})
a.to_json('a.json', orient='table')
But how can I read this JSON file back into a DataFrame?
I tried pd.read_json('a.json', orient='table'), but it raised a ValueError:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-22-7527b25107ef> in <module>()
----> 1 pd.read_json('a.json', orient='table')
C:\Anaconda3\lib\site-packages\pandas\io\json\json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines)
352 obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
353 keep_default_dates, numpy, precise_float,
--> 354 date_unit).parse()
355
356 if typ == 'series' or obj is None:
C:\Anaconda3\lib\site-packages\pandas\io\json\json.py in parse(self)
420
421 else:
--> 422 self._parse_no_numpy()
423
424 if self.obj is None:
C:\Anaconda3\lib\site-packages\pandas\io\json\json.py in _parse_no_numpy(self)
650 else:
651 self.obj = DataFrame(
--> 652 loads(json, precise_float=self.precise_float), dtype=None)
653
654 def _process_converter(self, f, filt=None):
C:\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
273 dtype=dtype, copy=copy)
274 elif isinstance(data, dict):
--> 275 mgr = self._init_dict(data, index, columns, dtype=dtype)
276 elif isinstance(data, ma.MaskedArray):
277 import numpy.ma.mrecords as mrecords
C:\Anaconda3\lib\site-packages\pandas\core\frame.py in _init_dict(self, data, index, columns, dtype)
409 arrays = [data[k] for k in keys]
410
--> 411 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
412
413 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
C:\Anaconda3\lib\site-packages\pandas\core\frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
5592 # figure out the index, if necessary
5593 if index is None:
-> 5594 index = extract_index(arrays)
5595 else:
5596 index = _ensure_index(index)
C:\Anaconda3\lib\site-packages\pandas\core\frame.py in extract_index(data)
5643
5644 if have_dicts:
-> 5645 raise ValueError('Mixing dicts with non-Series may lead to '
5646 'ambiguous ordering.')
5647
ValueError: Mixing dicts with non-Series may lead to ambiguous ordering.
So is there a way I can read that JSON file? Thanks in advance.
PS: the JSON file looks like this:
{"schema": {"pandas_version":"0.20.0","fields":[{"type":"integer","name":"index"},{"type":"integer","name":"a"},{"type":"integer","name":"b"}],"primaryKey":["index"]}, "data": [{"index":0,"a":1,"b":4},{"index":1,"a":2,"b":5},{"index":2,"a":3,"b":6}]}
Apparently the new method writes some metadata with the dataset into the JSON, such as the pandas version. Hence, consider using the built-in json module to read in this nested object and extract the value at the data key:
import json
...
with open('a.json', 'r') as f:
    json_obj = json.loads(f.read())

df = pd.DataFrame(json_obj['data']).set_index('index')
df.index.name = None
print(df)
#    a  b
# 0  1  4
# 1  2  5
# 2  3  6
Should you intend to use the type and name fields, run a dictionary and a list comprehension over those parts of the nested JSON. Note that here 'integer' has to be sliced down to 'int'. The dtype argument of the DataFrame constructor cannot be used, since the column names are not assigned until after that step:
with open('a.json', 'r') as f:
    json_obj = json.loads(f.read())

df = pd.DataFrame(json_obj['data'], columns=[t['name']
                                             for t in json_obj['schema']['fields']])
df = df.astype(dtype={t['name']: t['type'][:3]
                      for t in json_obj['schema']['fields']}).set_index('index')
df.index.name = None
print(df)
#    a  b
# 0  1  4
# 1  2  5
# 2  3  6
Here is a function I have developed from Parfait's answer:
def table_to_df(table):
    df = pd.DataFrame(table['data'],
                      columns=[t['name'] for t in table['schema']['fields']])
    for t in table['schema']['fields']:
        if t['type'] == "datetime":
            df[t['name']] = pd.to_datetime(df[t['name']], infer_datetime_format=True)
    df.set_index(table['schema']['primaryKey'], inplace=True)
    return df
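A usage sketch for this helper, reading back the file written above with the standard json module (note that newer pandas releases, 0.23 and later, can also do pd.read_json('a.json', orient='table') directly):
import json
import pandas as pd

with open('a.json', 'r') as f:
    table = json.load(f)

df = table_to_df(table)
print(df)   # columns a and b, indexed by the 'index' column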

python, change json format list to data frame

*js=['{"id":42352,"user_id":11770,"recipient_id":29936,"exchange_rate_list_id":39298,"send_amount_cents":"73860000","send_amount_currency":"KRW","commission_cents":"3000000","commission_currency":"KRW","receive_amount_cents":"3000000","receive_amount_currency":"PHP","save_amount_cents":"3336382","save_amount_currency":"KRW","status":0,"created_at":"2016-10-28T09:10:26.751Z","updated_at":"2016-10-28T09:10:26.751Z","transfer_list_id":null,"purpose":"living_expenses","external_fee_cents":"708000","external_fee_currency":"KRW","sender_country":"KR"}',
'{"id":42362,"user_id":995,"recipient_id":13068,"exchange_rate_list_id":39306,"send_amount_cents":"37500000","send_amount_currency":"KRW","commission_cents":"1875000","commission_currency":"KRW","receive_amount_cents":"1509500","receive_amount_currency":"PHP","save_amount_cents":"3411736","save_amount_currency":"KRW","status":0,"created_at":"2016-10-28T10:22:35.831Z","updated_at":"2016-10-28T10:22:35.831Z","transfer_list_id":null,"purpose":"living_expenses","external_fee_cents":"472000","external_fee_currency":"KRW","sender_country":"KR"}',
'{"id":42351,"user_id":3563,"recipient_id":29935,"exchange_rate_list_id":39298,"send_amount_cents":"8703000","send_amount_currency":"KRW","commission_cents":"436000","commission_currency":"KRW","receive_amount_cents":"350000","receive_amount_currency":"PHP","save_amount_cents":"4413495","save_amount_currency":"KRW","status":0,"created_at":"2016-10-28T09:08:41.488Z","updated_at":"2016-10-28T09:08:41.488Z","transfer_list_id":null,"purpose":"living_expenses","external_fee_cents":"283000","external_fee_currency":"KRW","sender_country":"KR"}',
'{"id":42359,"user_id":2657,"recipient_id":27757,"exchange_rate_list_id":39302,"send_amount_cents":"9937000","send_amount_currency":"KRW","commission_cents":"497000","commission_currency":"KRW","receive_amount_cents":"400000","receive_amount_currency":"PHP","save_amount_cents":"4369830","save_amount_currency":"KRW","status":0,"created_at":"2016-10-28T09:47:35.891Z","updated_at":"2016-10-28T09:47:35.891Z","transfer_list_id":null,"purpose":"living_expenses","external_fee_cents":"283000","external_fee_currency":"KRW","sender_country":"KR"}',
'{"id":42341,"user_id":4472,"recipient_id":29931,"exchange_rate_list_id":39290,"send_amount_cents":"49727000","send_amount_currency":"KRW","commission_cents":"2487000","commission_currency":"KRW","receive_amount_cents":"2000000","receive_amount_currency":"PHP","save_amount_cents":"2987161","save_amount_currency":"KRW","status":0,"created_at":"2016-10-28T07:45:21.924Z","updated_at":"2016-10-28T07:45:21.924Z","transfer_list_id":null,"purpose":"living_expenses","external_fee_cents":"472000","external_fee_currency":"KRW","sender_country":"KR"}',
'{"id":42342,"user_id":4471,"recipient_id":17086,"exchange_rate_list_id":39292,"send_amount_cents":"25000000","send_amount_currency":"KRW","commission_cents":"1250000","commission_currency":"KRW","receive_amount_cents":"1005500","receive_amount_currency":"PHP","save_amount_cents":"3846653","save_amount_currency":"KRW","status":0,"created_at":"2016-10-28T08:07:36.936Z","updated_at":"2016-10-28T08:07:36.936Z","transfer_list_id":null,"purpose":"living_expenses","external_fee_cents":"354000","external_fee_currency":"KRW","sender_country":"KR"}',
'{"id":42343,"user_id":4471,"recipient_id":12681,"exchange_rate_list_id":39292,"send_amount_cents":"6000000","send_amount_currency":"KRW","commission_cents":"300000","commission_currency":"KRW","receive_amount_cents":"241300","receive_amount_currency":"PHP","save_amount_cents":"4506244","save_amount_currency":"KRW","status":0,"created_at":"2016-10-28T08:09:24.871Z","updated_at":"2016-10-28T08:09:24.871Z","transfer_list_id":null,"purpose":"living_expenses","external_fee_cents":"236000","external_fee_currency":"KRW","sender_country":"KR"}',
'{"id":42198,"user_id":9950,"recipient_id":29834,"exchange_rate_list_id":39165,"send_amount_cents":"7453000","send_amount_currency":"KRW","commission_cents":"373000","commission_currency":"KRW","receive_amount_cents":"300000","receive_amount_currency":"PHP","save_amount_cents":"4451416","save_amount_currency":"KRW","status":0,"created_at":"2016-10-27T10:58:31.712Z","updated_at":"2016-10-27T10:58:31.712Z","transfer_list_id":null,"purpose":"living_expenses","external_fee_cents":"0","external_fee_currency":"KRW","sender_country":"KR"}',
'{"id":42199,"user_id":2001,"recipient_id":29835,"exchange_rate_list_id":39166,"send_amount_cents":"4969000","send_amount_currency":"KRW","commission_cents":"249000","commission_currency":"KRW","receive_amount_cents":"200000","receive_amount_currency":"PHP","save_amount_cents":"4537501","save_amount_currency":"KRW","status":0,"created_at":"2016-10-27T11:00:02.677Z","updated_at":"2016-10-27T11:00:02.677Z","transfer_list_id":null,"purpose":"living_expenses","external_fee_cents":"188000","external_fee_currency":"KRW","sender_country":"KR"}']*
I have a list of strings in JSON format named js.
But when I do
pd.read_json(js)
I get an error saying
TypeError Traceback (most recent call last)
in ()
----> 1 pd.read_json(js)
//anaconda/lib/python2.7/site-packages/pandas/io/json.pyc in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit)
209 obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
210 keep_default_dates, numpy, precise_float,
--> 211 date_unit).parse()
212
213 if typ == 'series' or obj is None:
//anaconda/lib/python2.7/site-packages/pandas/io/json.pyc in parse(self)
277
278 else:
--> 279 self._parse_no_numpy()
280
281 if self.obj is None:
//anaconda/lib/python2.7/site-packages/pandas/io/json.pyc in _parse_no_numpy(self)
494 if orient == "columns":
495 self.obj = DataFrame(
--> 496 loads(json, precise_float=self.precise_float), dtype=None)
497 elif orient == "split":
498 decoded = dict((str(k), v)
TypeError: Expected String or Unicode
I got it to work by doing
df = pd.DataFrame()
for j in js:
    data = pd.read_json(j, typ='list')
    df = df.append(data, ignore_index=True)
which took forever to execute.
My question is: if I am allowed to read the list one item at a time and append it to an empty DataFrame, why can't I just read the whole list without getting an error? Is there any way to fix this problem? Thanks.
import pandas as pd

# js is the same list of JSON strings shown in the question above
a = pd.read_json('[{}]'.format(','.join(js)))
print(a)
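read_json expects a single JSON document (a string, path, or file handle), not a Python list of strings, which is why joining the elements into one JSON array string works. An alternative sketch that avoids building the large concatenated string, assuming every element of js is a valid JSON object:
import json
import pandas as pd

records = [json.loads(s) for s in js]   # parse each JSON string into a dict
a = pd.DataFrame(records)               # one constructor call instead of appending in a loop
print(a.shape)                          # 9 rows, one column per JSON key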

KeyError when using s.loc and s.first_valid_index()

I have data similar to this post: pandas: Filling missing values within a group
That is, I have data in a number of observation sessions, and there is a focal individual for each session. That focal individual is only noted once, but I want to fill in the focal ID data for each line during that session. So, the data look something like this:
   Focal  Session
0    NaN        1
1  50101        1
2    NaN        1
3    NaN        2
4  50408        2
5    NaN        2
Based on the post linked above, I was using this code:
g = data.groupby('Session')
g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
But this returns a KeyError (specifically, KeyError: None). According to the .loc documentation, KeyErrors can result when the data isn't found. So I've checked, and while I have 152 sessions, I only have 150 non-null data points in the Focal column. Before I decide to manually search my data for which of the sessions is missing a Focal ID, I have two questions:
I am very much a beginner. So is this a reasonable explanation for why I am getting a KeyError?
If it is reasonable, is there a way to figure out which Session is missing Focal ID data, that will save me from manually looking through the data?
Output here:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-330-0e4f27aa7e14> in <module>()
----> 1 data['Focal'] = g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
2 g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
//anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in transform(self, func, *args, **kwargs)
1540 for name, group in self:
1541 object.__setattr__(group, 'name', name)
-> 1542 res = wrapper(group)
1543 # result[group.index] = res
1544 indexer = self.obj.index.get_indexer(group.index)
//anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in <lambda>(x)
1536 wrapper = lambda x: getattr(x, func)(*args, **kwargs)
1537 else:
-> 1538 wrapper = lambda x: func(x, *args, **kwargs)
1539
1540 for name, group in self:
<ipython-input-330-0e4f27aa7e14> in <lambda>(s)
----> 1 data['Focal'] = g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
2 g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
//anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in __getitem__(self, key)
669 return self._getitem_tuple(key)
670 else:
--> 671 return self._getitem_axis(key, axis=0)
672
673 def _getitem_axis(self, key, axis=0):
//anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in _getitem_axis(self, key, axis)
756 return self._getitem_iterable(key, axis=axis)
757 else:
--> 758 return self._get_label(key, axis=axis)
759
760 class _iLocIndexer(_LocationIndexer):
//anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in _get_label(self, label, axis)
58 return self.obj._xs(label, axis=axis, copy=False)
59 except Exception:
---> 60 return self.obj._xs(label, axis=axis, copy=True)
61
62 def _get_loc(self, key, axis=0):
//anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in _xs(self, key, axis, level, copy)
570
571 def _xs(self, key, axis=0, level=None, copy=True):
--> 572 return self.__getitem__(key)
573
574 def _ixs(self, i, axis=0):
//anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
611 def __getitem__(self, key):
612 try:
--> 613 return self.index.get_value(self, key)
614 except InvalidIndexError:
615 pass
//anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_value(self, series, key)
761 """
762 try:
--> 763 return self._engine.get_value(series, key)
764 except KeyError, e1:
765 if len(self) > 0 and self.inferred_type == 'integer':
//anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_value (pandas/index.c:2565)()
//anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_value (pandas/index.c:2380)()
//anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3166)()
KeyError: None
The problem is that first_valid_index returns None if there are no valid values (some groups in your DataFrame are all NaN):
In [1]: s = pd.Series([np.nan])
In [2]: s.first_valid_index() # None
Now, loc throws an error because there is no index None:
In [3]: s.loc[s.first_valid_index()]
KeyError: None
What do you want your code to do in this particular case? ...
If you wanted it to be NaN, you could backfill and then take the first element:
g['Focal'].transform(lambda s: s.bfill().iloc[0])
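To also find out which sessions have no Focal value at all (the second question), a small sketch with the same grouping, assuming the DataFrame is called data as above:
# Sessions whose Focal column is entirely NaN
all_missing = data.groupby('Session')['Focal'].apply(lambda s: s.isnull().all())
print(all_missing[all_missing].index.tolist())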
If you want to handle the case where some groups contain only NaN, you could do the following:
g = data.groupby('Session')
data['Focal'] = g['Focal'].transform(lambda s: 'No values to aggregate' if pd.isnull(s).all() else s.loc[s.first_valid_index()])
In this way you insert 'No values to aggregate' (or whatever you want) when a group contains only NaN, instead of stopping execution with an error.
Hope this helps :)
Federico

Pandas duplicate datetimeindex entries lead to odd exception

Let's take the following contrived example where I create a DataFrame and then make a DatetimeIndex using a column with duplicate entries. I then place this DataFrame into a Panel and then attempt to iterate over the major axis.
import pandas as pd
import datetime as dt
from collections import OrderedDict

a = [1371215933513120, 1371215933513121, 1371215933513122, 1371215933513122]
b = [1, 2, 3, 4]
df = pd.DataFrame({'a': a, 'b': b, 'c': [dt.datetime.fromtimestamp(t/1000000.) for t in a]})
df.index = pd.DatetimeIndex(df['c'])

d = OrderedDict()
d['x'] = df
p = pd.Panel(d)

for y in p.major_axis:
    print y
    print p.major_xs(y)
This leads to the following output:
2013-06-14 15:18:53.513120
x
a 1371215933513120
b 1
c 2013-06-14 15:18:53.513120
2013-06-14 15:18:53.513121
x
a 1371215933513121
b 2
c 2013-06-14 15:18:53.513121
2013-06-14 15:18:53.513122
Followed by a rather cryptic (to me) error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-35-045aaae5a074> in <module>()
13 for y in p.major_axis:
14 print y
---> 15 print p.major_xs(y)
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __str__(self)
667 if py3compat.PY3:
668 return self.__unicode__()
--> 669 return self.__bytes__()
670
671 def __bytes__(self):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __bytes__(self)
677 """
678 encoding = com.get_option("display.encoding")
--> 679 return self.__unicode__().encode(encoding, 'replace')
680
681 def __unicode__(self):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __unicode__(self)
692 # This needs to compute the entire repr
693 # so don't do it unless rownum is bounded
--> 694 fits_horizontal = self._repr_fits_horizontal_()
695
696 if fits_vertical and fits_horizontal:
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in _repr_fits_horizontal_(self)
652 d=d.iloc[:min(max_rows, height,len(d))]
653
--> 654 d.to_string(buf=buf)
655 value = buf.getvalue()
656 repr_width = max([len(l) for l in value.split('\n')])
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, nanRep, index_names, justify, force_unicode, line_width)
1489 header=header, index=index,
1490 line_width=line_width)
-> 1491 formatter.to_string()
1492
1493 if buf is None:
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in to_string(self, force_unicode)
312 text = info_line
313 else:
--> 314 strcols = self._to_str_columns()
315 if self.line_width is None:
316 text = adjoin(1, *strcols)
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _to_str_columns(self)
265 for i, c in enumerate(self.columns):
266 if self.header:
--> 267 fmt_values = self._format_col(i)
268 cheader = str_columns[i]
269
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _format_col(self, i)
403 float_format=self.float_format,
404 na_rep=self.na_rep,
--> 405 space=self.col_space)
406
407 def to_html(self, classes=None):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify)
1319 justify=justify)
1320
-> 1321 return fmt_obj.get_result()
1322
1323
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in get_result(self)
1335
1336 def get_result(self):
-> 1337 fmt_values = self._format_strings()
1338 return _make_fixed_width(fmt_values, self.justify)
1339
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _format_strings(self)
1362
1363 print "vals:", vals
-> 1364 is_float = lib.map_infer(vals, com.is_float) & notnull(vals)
1365 leading_space = is_float.any()
1366
ValueError: operands could not be broadcast together with shapes (2) (2,3)
Now, having explained that I'm creating an index with duplicate entries, the source of the error is clear. Without having known that, however, it would have been more difficult (again, for a novice like me) to figure out why this Exception was popping up.
This leads me to a few questions.
Is this really the expected behavior of pandas? Is it forbidden to create an index with duplicate entries, or is it just forbidden to iterate over them?
If it's forbidden to create such an index, then shouldn't an exception be raised when initially creating it?
If the iteration is somehow incorrect, shouldn't the error be more informative?
Am I doing something wrong?
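Not a full answer, but a quick way to confirm that the duplicate timestamps are the culprit, and one workaround, assuming a pandas version where Index.duplicated is available (the example above targets pandas 0.11 under Python 2, so treat this as a sketch):
print df.index.is_unique                  # False -- the last two timestamps collide
print df.index[df.index.duplicated()]     # the repeated label(s)

# One workaround: keep only the first row per duplicated timestamp
df_unique = df[~df.index.duplicated()]
d['x'] = df_unique
p = pd.Panel(d)                           # iterating p.major_axis should now print without the error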
