Numpy - Length of values (1191) does not match length of index (1250) - python

Would appreciate any help with this. I'm getting an error of
ValueError: Length of values (1191) does not match length of index (1250).
I don't understand where NumPy is getting the length of 1191 from. I've created a DataFrame of 1250 rows, and I'm trying to assign future['floor'] to it based on conditions. future['cap'] works fine, but that is pandas, whereas 'floor' is using NumPy, and I don't understand why NumPy would cause this error. Thanks for your help. Gav
future = m.make_future_dataframe(periods=1250,freq='D', include_history=False)
conditions = [
g['Operator'] == 100151,
g['Operator'] == 20137,
g['Operator'] == 20147,
]
values = [
g['y'].mean()/2,
g['y'].mean()/2,
g['y'].mean()/2
]
future['floor'] = np.select(conditions,values)
future['cap'] = max(g['y'])*1.25
forecast = m.predict(future)
ValueError Traceback (most recent call last)
<ipython-input-184-a698f789f6b3> in <module>
----> 1 fout = df.groupby('Operator').apply(forecast_data)
~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
892 with option_context("mode.chained_assignment", None):
893 try:
--> 894 result = self._python_apply_general(f, self._selected_obj)
895 except TypeError:
896 # gh-20949
~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f, data)
926 data after applying f
927 """
--> 928 keys, values, mutated = self.grouper.apply(f, data, self.axis)
929
930 return self._wrap_applied_output(
~\Anaconda3\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
236 # group might be modified
237 group_axes = group.axes
--> 238 res = f(group)
239 if not _is_indexed_like(res, group_axes, axis):
240 mutated = True
<ipython-input-183-f88148e0e94e> in forecast_data(g)
42 g['y'].mean()/2
43 ]
---> 44 future['floor'] = np.select(conditions,values)
45 future['cap'] = max(g['y'])*1.25
46 forecast = m.predict(future)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3161 else:
3162 # set column
-> 3163 self._set_item(key, value)
3164
3165 def _setitem_slice(self, key: slice, value):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3240 """
3241 self._ensure_valid_index(value)
-> 3242 value = self._sanitize_column(key, value)
3243 NDFrame._set_item(self, key, value)
3244
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
3897
3898 # turn me into an ndarray
-> 3899 value = sanitize_index(value, self.index)
3900 if not isinstance(value, (np.ndarray, Index)):
3901 if isinstance(value, list) and len(value) > 0:
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in sanitize_index(data, index)
749 """
750 if len(data) != len(index):
--> 751 raise ValueError(
752 "Length of values "
753 f"({len(data)}) "
ValueError: Length of values (1191) does not match length of index (1250)

Related

Transform pandas series of serialized json into pandas series of dicts

I have the following pandas series:
>>>df.A.head()
0 {"Date_": "2022-06-01T01:00:00+05:30", "submit...
1 {"Growth": [{"textField": "", "Change_Size": "...
2 {"submit": true, "HSI_Tag": "xyz...
3 {"submit": true, "HSI_Tag": "xyz...
4 {"submit": true, "roleList": "xy...
Name: A, dtype: object
Every item in the series is a serialized JSON item. I would like to turn every item into a dictionary. I am trying to do the following, but I get an error:
for i in range(len(df.A)):
df.A.iloc[i] = json.loads(df.A.iloc[i])
The error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-27-9b4e8d4e6d76> in <module>
1 for i in range(len(df.A)):
----> 2 df.A.iloc[i] = json.loads(df.A.iloc[i])
C:\ANACONDA3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
188 key = com.apply_if_callable(key, self.obj)
189 indexer = self._get_setitem_indexer(key)
--> 190 self._setitem_with_indexer(indexer, value)
191
192 def _validate_key(self, key, axis):
C:\ANACONDA3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
640 # setting for extensionarrays that store dicts. Need to decide
641 # if it's worth supporting that.
--> 642 value = self._align_series(indexer, Series(value))
643
644 elif isinstance(value, ABCDataFrame):
C:\ANACONDA3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
774
775 elif is_scalar(indexer):
--> 776 ax = self.obj._get_axis(1)
777
778 if ser.index.equals(ax):
C:\ANACONDA3\lib\site-packages\pandas\core\generic.py in _get_axis(self, axis)
376
377 def _get_axis(self, axis):
--> 378 name = self._get_axis_name(axis)
379 return getattr(self, name)
380
C:\ANACONDA3\lib\site-packages\pandas\core\generic.py in _get_axis_name(cls, axis)
373 pass
374 raise ValueError('No axis named {0} for object type {1}'
--> 375 .format(axis, type(cls)))
376
377 def _get_axis(self, axis):
ValueError: No axis named 1 for object type <class 'type'>
How can I fix it?
I managed to do it eventually with apply and a lambda like this:
df.A = df.A.apply(lambda x: json.loads(x))

Pandas read_csv - non-printable character (columns not recognized)

Could someone tell me what non-printable character I have in my code that makes Python not recognize the column names in my dataframe?
import pandas as pd
data_olymp = pd.read_csv("Olympics_data.csv", sep=";")
Here is the traceback of the error when I try to group by team name:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-103-ae95f10f5210> in <module>
30 # print(type(réponse1))
31 # print(len(winter_games_bronze_won))
---> 32 print(data_olymp.loc[" winter_games_bronze_won"] == 9)
~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
893
894 maybe_callable = com.apply_if_callable(key, self.obj)
--> 895 return self._getitem_axis(maybe_callable, axis=axis)
896
897 def _is_scalar_access(self, key: Tuple):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1122 # fall thru to straight lookup
1123 self._validate_key(key, axis)
-> 1124 return self._get_label(key, axis=axis)
1125
1126 def _get_slice_axis(self, slice_obj: slice, axis: int):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
1071 def _get_label(self, label, axis: int):
1072 # GH#5667 this will fail if the label is not present in the axis.
-> 1073 return self.obj.xs(label, axis=axis)
1074
1075 def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):
~\anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3737 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
3738 else:
-> 3739 loc = index.get_loc(key)
3740
3741 if isinstance(loc, np.ndarray):
~\anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
352 except ValueError as err:
353 raise KeyError(key) from err
--> 354 raise KeyError(key)
355 return super().get_loc(key, method=method, tolerance=tolerance)
356
KeyError: ' winter_games_bronze_won'
The file looks like this:
team_name; summer_games_played; summer_games_gold_won; summer_games_silver_won; summer_games_bronze_won; summer_games_medals_won; winter_games_played; winter_games_gold_won; winter_games_silver_won; winter_games_bronze_won; winter_games_medals_won; total_games_played
Canada (CAN);13;0;0;2;2;0;0;0;0;0;13
United States (USA);12;5;2;8;15;3;0;0;0;0;15
Russia (RUS);23;18;24;28;70;18;0;0;0;0;41
Key errors are raised when you try to access a key that is not in a dictionary. When working with Pandas, it is much the same thing: .loc is trying to locate a key value that is not found in the data frame.
Looking at your code and the traceback error, my assumption is that because you are trying to look up winter_games_bronze_won (with the spaces at the beginning), you are getting the error. Try removing the spaces before winter_games_bronze_won and see what happens.

Replace a string with a shorter version of itself using pandas

I have a pandas dataframe with one column of model variables and their corresponding statistics in another column. I've done some string manipulation to get a derived summary table to join the summary table from the model.
lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
Full traceback.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-229-1dbe5bd14d4b> in <module>
----> 1 lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
2 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_v_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
3 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('married_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
4 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('state_model', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
5
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
187 key = com._apply_if_callable(key, self.obj)
188 indexer = self._get_setitem_indexer(key)
--> 189 self._setitem_with_indexer(indexer, value)
190
191 def _validate_key(self, key, axis):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
467
468 if isinstance(value, ABCSeries):
--> 469 value = self._align_series(indexer, value)
470
471 info_idx = indexer[info_axis]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
732 return ser._values.copy()
733
--> 734 return ser.reindex(new_ix)._values
735
736 # 2 dims
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
3323 #Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
3324 def reindex(self, index=None, **kwargs):
-> 3325 return super(Series, self).reindex(index=index, **kwargs)
3326
3327 def drop(self, labels=None, axis=0, index=None, columns=None,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
3687 # perform the reindex on the axes
3688 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3689 fill_value, copy).__finalize__(self)
3690
3691 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3705 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
3706 fill_value=fill_value,
-> 3707 copy=copy, allow_dups=False)
3708
3709 return obj
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
3808 fill_value=fill_value,
3809 allow_dups=allow_dups,
-> 3810 copy=copy)
3811
3812 if copy and new_data is self._data:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
4412 # some axes don't allow reindexing with dups
4413 if not allow_dups:
-> 4414 self.axes[axis]._can_reindex(indexer)
4415
4416 if axis >= self.ndim:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _can_reindex(self, indexer)
3574 # trying to reindex on an axis with duplicates
3575 if not self.is_unique and len(indexer):
-> 3576 raise ValueError("cannot reindex from a duplicate axis")
3577
3578 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
However, when I replace it with an example, it works, and the only difference is the data frame name. See below. I don't see what the difference between the two lines of code is. Any ideas?
variable = ['class_cc-Harley', 'class_cc_Sport', 'class_cc_Other', 'unit_driver_experience']
unique_value = [1200, 1400, 700, 45]
p_value = [.0001, .0001, .0001, .049]
dic = {'variable': variable, 'unique_value':unique_value, 'p_value':p_value}
df = pd.DataFrame(dic)
df.loc[df['variable'].str.contains('class_cc', case = False), 'variable'] = df['variable'].str[:8]
The index of lost_cost_final_table is not unique, which can be fixed by running reset_index:
lost_cost_final_table.reset_index(inplace=True)

Cannot Load From HDF Data Store

I am using pandas 0.11.0 on Windows 64 Bit.
I have a datastore:
store = pandas.io.pytables.HDFStore( r"""C:\data.h5""" )
It has some data in it:
print(store)
<class 'pandas.io.pytables.HDFStore'>
File path: C:\data.h5
/region frame (shape->[30,2])
/sector frame (shape->[116,2])
When I try to load 'sector' from the store I get:
store['/sector']
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-24-7fea069742ed> in <module>()
----> 1 store['/sector']
C:\Anaconda\lib\site-packages\pandas\io\pytables.pyc in __getitem__(self, key)
236
237 def __getitem__(self, key):
--> 238 return self.get(key)
239
240 def __setitem__(self, key, value):
C:\Anaconda\lib\site-packages\pandas\io\pytables.pyc in get(self, key)
369 if group is None:
370 raise KeyError('No object named %s in the file' % key)
--> 371 return self._read_group(group)
372
373 def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, **kwargs):
C:\Anaconda\lib\site-packages\pandas\io\pytables.pyc in _read_group(self, group, **kwargs)
876 s = self._create_storer(group)
877 s.infer_axes()
--> 878 return s.read(**kwargs)
879
880 class TableIterator(object):
C:\Anaconda\lib\site-packages\pandas\io\pytables.pyc in read(self, **kwargs)
1982 blocks = []
1983 for i in range(self.nblocks):
-> 1984 blk_items = self.read_index('block%d_items' % i)
1985 values = self.read_array('block%d_values' % i)
1986 blk = make_block(values, blk_items, items)
C:\Anaconda\lib\site-packages\pandas\io\pytables.pyc in read_index(self, key)
1615 return self.read_sparse_intindex(key)
1616 elif variety == 'regular':
-> 1617 _, index = self.read_index_node(getattr(self.group, key))
1618 return index
1619 else: # pragma: no cover
C:\Anaconda\lib\site-packages\pandas\io\pytables.pyc in read_index_node(self, node)
1733 **kwargs)
1734 else:
-> 1735 index = factory(_unconvert_index(data, kind), **kwargs)
1736
1737 index.name = name
TypeError: __new__() got an unexpected keyword argument 'freq'
What is this "freq" argument? And why is it stopping me from loading the data?

KeyError when using s.loc and s.first_valid_index()

I have data similar to this post: pandas: Filling missing values within a group
That is, I have data in a number of observation sessions, and there is a focal individual for each session. That focal individual is only noted once, but I want to fill in the focal ID data for each line during that session. So, the data look something like this:
Focal Session
0 NaN 1
1 50101 1
2 NaN 1
3 NaN 2
4 50408 2
5 NaN 2
Based on the post linked above, I was using this code:
g = data.groupby('Session')
g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
But this returns a KeyError (specifically, KeyError:None). According to the .loc documentation, KeyErrors can result when the data isn't found. So, I've checked and while I have 152 sessions, I only have 150 non-null data points in the Focal column. Before I decide to manually search my data for which of the sessions is missing a Focal ID, I have two questions:
I am very much a beginner. So is this a reasonable explanation for why I am getting a KeyError?
If it is reasonable, is there a way to figure out which Session is missing Focal ID data, that will save me from manually looking through the data?
Output here:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-330-0e4f27aa7e14> in <module>()
----> 1 data['Focal'] = g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
2 g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
//anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in transform(self, func, *args, **kwargs)
1540 for name, group in self:
1541 object.__setattr__(group, 'name', name)
-> 1542 res = wrapper(group)
1543 # result[group.index] = res
1544 indexer = self.obj.index.get_indexer(group.index)
//anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in <lambda>(x)
1536 wrapper = lambda x: getattr(x, func)(*args, **kwargs)
1537 else:
-> 1538 wrapper = lambda x: func(x, *args, **kwargs)
1539
1540 for name, group in self:
<ipython-input-330-0e4f27aa7e14> in <lambda>(s)
----> 1 data['Focal'] = g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
2 g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
//anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in __getitem__(self, key)
669 return self._getitem_tuple(key)
670 else:
--> 671 return self._getitem_axis(key, axis=0)
672
673 def _getitem_axis(self, key, axis=0):
//anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in _getitem_axis(self, key, axis)
756 return self._getitem_iterable(key, axis=axis)
757 else:
--> 758 return self._get_label(key, axis=axis)
759
760 class _iLocIndexer(_LocationIndexer):
//anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in _get_label(self, label, axis)
58 return self.obj._xs(label, axis=axis, copy=False)
59 except Exception:
---> 60 return self.obj._xs(label, axis=axis, copy=True)
61
62 def _get_loc(self, key, axis=0):
//anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in _xs(self, key, axis, level, copy)
570
571 def _xs(self, key, axis=0, level=None, copy=True):
--> 572 return self.__getitem__(key)
573
574 def _ixs(self, i, axis=0):
//anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
611 def __getitem__(self, key):
612 try:
--> 613 return self.index.get_value(self, key)
614 except InvalidIndexError:
615 pass
//anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_value(self, series, key)
761 """
762 try:
--> 763 return self._engine.get_value(series, key)
764 except KeyError, e1:
765 if len(self) > 0 and self.inferred_type == 'integer':
//anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_value (pandas/index.c:2565)()
//anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_value (pandas/index.c:2380)()
//anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3166)()
KeyError: None
The problem is that first_valid_index returns None if there are no valid values (some groups in your DataFrame are all NaN):
In [1]: s = pd.Series([np.nan])
In [2]: s.first_valid_index() # None
Now, loc throws an error because there is no index None:
In [3]: s.loc[s.first_valid_index()]
KeyError: None
What do you want your code to do in this particular case? ...
If you wanted it to be NaN, you could backfill and then take the first element:
g['Focal'].transform(lambda s: s.bfill().iloc[0])
If you want to fix the problem that some groups contain only NaN, you could do the following:
g = data.groupby('Session')
g['Focal'].transform(lambda s: 'No values to aggregate' if pd.isnull(s).all() == True else s.loc[s.first_valid_index()])
df['Focal'] = g['Focal'].transform(lambda s: 'No values to aggregate' if pd.isnull(s).all() == True else s.loc[s.first_valid_index()])
In this way you insert 'No values to aggregate' (or whatever you want) when the program finds all NaN for a particular group, instead of halting execution with an error.
Hope this helps :)
Federico

Categories