I get an error when trying to create a Facet Grid with Seaborn. I have 3 categorical variables: Gender, Day of the Week, Color. I want to understand the direct correlation of all values within each category to each other.
Gender: Female, Male
Day of the Week: Mo,Tue,Wed,Thu,Fri,Sat,Sun
Color:Red, Green.
g = sns.FacetGrid(tips, col="Gender", row="Color")
g = g.map(plt.hist, "Day of the Week")
display()
I get an error:
KeyError-Traceback (most recent call last)
<command-206114> in <module>()
2 tips = sns.load_dataset("tips")
3
----> 4 g = sns.FacetGrid(tips, col="Gender", row="Color")
5 g = g.map(plt.hist, "Day of the week")
6 display()
/databricks/python/lib/python3.5/site-packages/seaborn/axisgrid.py in __init__(self, data, row, col, hue, col_wrap, sharex, sharey, size, aspect, palette, row_order, col_order, hue_order, hue_kws, dropna, legend_out, despine, margin_titles, xlim, ylim, subplot_kws, gridspec_kws)
240 row_names = []
241 else:
--> 242 row_names = utils.categorical_order(data[row], row_order)
243
244 if col is None:
/databricks/python/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
/databricks/python/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
/databricks/python/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1384 res = cache.get(item)
1385 if res is None:
-> 1386 values = self._data.get(item)
1387 res = self._box_item_values(item, values)
1388 cache[item] = res
/databricks/python/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3541
3542 if not isnull(item):
-> 3543 loc = self.items.get_loc(item)
3544 else:
3545 indexer = np.arange(len(self.items))[isnull(self.items)]
/databricks/python/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2134 return self._engine.get_loc(key)
2135 except KeyError:
-> 2136 return self._engine.get_loc(self._maybe_cast_indexer(key))
2137
2138 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
KeyError: 'Color'
Does anybody know why this is?
The variable "Color" is the issue here: the KeyError means the DataFrame has no column with that name. It could be misspelled. Also, if you are plotting two categorical variables, try using a bar chart.
Related
Could someone tell me what non-printable character I have in my code that makes Python not recognize the column names in my dataframe?
import pandas as pd
data_olymp = pd.read_csv("Olympics_data.csv", sep=";")
Here is the traceback of the error when I try to group by teamname:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-103-ae95f10f5210> in <module>
30 # print(type(réponse1))
31 # print(len(winter_games_bronze_won))
---> 32 print(data_olymp.loc[" winter_games_bronze_won"] == 9)
~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
893
894 maybe_callable = com.apply_if_callable(key, self.obj)
--> 895 return self._getitem_axis(maybe_callable, axis=axis)
896
897 def _is_scalar_access(self, key: Tuple):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1122 # fall thru to straight lookup
1123 self._validate_key(key, axis)
-> 1124 return self._get_label(key, axis=axis)
1125
1126 def _get_slice_axis(self, slice_obj: slice, axis: int):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
1071 def _get_label(self, label, axis: int):
1072 # GH#5667 this will fail if the label is not present in the axis.
-> 1073 return self.obj.xs(label, axis=axis)
1074
1075 def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):
~\anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3737 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
3738 else:
-> 3739 loc = index.get_loc(key)
3740
3741 if isinstance(loc, np.ndarray):
~\anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
352 except ValueError as err:
353 raise KeyError(key) from err
--> 354 raise KeyError(key)
355 return super().get_loc(key, method=method, tolerance=tolerance)
356
KeyError: ' winter_games_bronze_won'
The file looks like this:
team_name; summer_games_played; summer_games_gold_won; summer_games_silver_won; summer_games_bronze_won; summer_games_medals_won; winter_games_played; winter_games_gold_won; winter_games_silver_won; winter_games_bronze_won; winter_games_medals_won; total_games_played
Canada (CAN);13;0;0;2;2;0;0;0;0;0;13
United States (USA);12;5;2;8;15;3;0;0;0;0;15
Russia (RUS);23;18;24;28;70;18;0;0;0;0;41
Key errors are raised when you try to access a key that is not in a dictionary. When working with pandas, it is about the same thing: .loc is trying to locate a key value that is not found in the data frame.
Looking at your code and the traceback error, my assumption is that because you are trying to look up winter_games_bronze_won (with the spaces at the beginning), you are getting the error. Try removing the spaces before winter_games_bronze_won and see what happens.
This little piece of code is giving me more trouble than necessary. I would appreciate any help. Thank you in advance for taking a look at this for me.
I am trying to sum up the price and the volume from the previous 22 data points, for which I added a column named dtt that holds this value. The formula that I am trying to represent here is:
vwap = [sum(1, dtt) price * volume] / [sum(1, dtt) volume]
Here is my code
# Import necessary libraries
import numpy as np
import pandas as pd
import os
# Load SPY dataset
spy_data = pd.read_csv('SPY.csv', parse_dates=['Date'])
# Compute Daily Return
spy_data['daily_ret'] = (spy_data['Adj Close'] - (spy_data['Adj Close']).shift(1)) / ((spy_data['Adj Close']).shift(1)) * 100
spy_data['daily_ret'] = spy_data['daily_ret'].fillna(0.0)
# calculate Annualized Volatility
rsquare = (spy_data['daily_ret']) ** 2
spy_data['annualized_volatility']=(np.sqrt(rsquare.rolling(252).sum() / 251) * np.sqrt(252))
spy_data['annualized_volatility'] = spy_data['annualized_volatility'].fillna(0)
spy_shares = 889112600
# Calculate Days to Trade
spy_data['dtt'] = spy_shares / (spy_data['Volume'].rolling(22).sum()/22)
spy_data['dtt'] = spy_data['dtt'].fillna(1).astype(int)
# Calculate VWAP
#numerator is equal to the sum of the price * volume of the latest DTT
numerator = spy_data.loc[0:spy_data['dtt'], 'Adj Close'].sum()#*spy_data.loc[0:spy_data['dtt'], 'Volume']
#denominator = spy_data.loc[0:spy_data['dtt'], 'Volume']
#spy_data['vwap'] = numerator / denominator
print(spy_data)
I commented out the other lines because I was trying to problem solve it one step at a time. The price column that I need is the Adj Close.
No matter what I try in terms of slicing I keep getting an error. This is the current one:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-68-ade975138e12> in <module>
7 # Calculate VWAP
8 #numerator is equal to the sum of the price * volume of the latest DTT
----> 9 numerator = spy_data.loc[0:spy_data['dtt']].sum()#*spy_data[0:spy_data['dtt'], 'volume']
10 #denominator = spy_data.loc[0:spy_data['dtt'], 'Volume']
11 #spy_data['vwap'] = numerator / denominator
~/.local/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
1408
1409 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1410 return self._getitem_axis(maybe_callable, axis=axis)
1411
1412 def _is_scalar_access(self, key: Tuple):
~/.local/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
1770 if isinstance(key, slice):
1771 self._validate_key(key, axis)
-> 1772 return self._get_slice_axis(key, axis=axis)
1773 elif com.is_bool_indexer(key):
1774 return self._getbool_axis(key, axis=axis)
~/.local/lib/python3.6/site-packages/pandas/core/indexing.py in _get_slice_axis(self, slice_obj, axis)
1438 labels = obj._get_axis(axis)
1439 indexer = labels.slice_indexer(
-> 1440 slice_obj.start, slice_obj.stop, slice_obj.step, kind=self.name
1441 )
1442
~/.local/lib/python3.6/site-packages/pandas/core/indexes/base.py in slice_indexer(self, start, end, step, kind)
5025 slice(1, 3)
5026 """
-> 5027 start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
5028
5029 # return a slice
~/.local/lib/python3.6/site-packages/pandas/core/indexes/base.py in slice_locs(self, start, end, step, kind)
5245 end_slice = None
5246 if end is not None:
-> 5247 end_slice = self.get_slice_bound(end, "right", kind)
5248 if end_slice is None:
5249 end_slice = len(self)
~/.local/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
5155 # we need to look up the label
5156 try:
-> 5157 slc = self.get_loc(label)
5158 except KeyError as err:
5159 try:
~/.local/lib/python3.6/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
377 except ValueError:
378 raise KeyError(key)
--> 379 return super().get_loc(key, method=method, tolerance=tolerance)
380
381 #Appender(_index_shared_docs["get_indexer"])
~/.local/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2888 )
2889 try:
-> 2890 return self._engine.get_loc(key)
2891 except KeyError:
2892 return self._engine.get_loc(self._maybe_cast_indexer(key))
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
TypeError: '0 1
1 1
2 1
3 1
4 1
..
4694 9
4695 9
4696 10
4697 10
4698 11
Name: dtt, Length: 4699, dtype: int64' is an invalid key
I have a pandas dataframe with one column of model variables and their corresponding statistics in another column. I've done some string manipulation to get a derived summary table to join the summary table from the model.
lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
Full traceback.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-229-1dbe5bd14d4b> in <module>
----> 1 lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
2 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_v_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
3 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('married_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
4 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('state_model', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
5
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
187 key = com._apply_if_callable(key, self.obj)
188 indexer = self._get_setitem_indexer(key)
--> 189 self._setitem_with_indexer(indexer, value)
190
191 def _validate_key(self, key, axis):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
467
468 if isinstance(value, ABCSeries):
--> 469 value = self._align_series(indexer, value)
470
471 info_idx = indexer[info_axis]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
732 return ser._values.copy()
733
--> 734 return ser.reindex(new_ix)._values
735
736 # 2 dims
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
3323 #Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
3324 def reindex(self, index=None, **kwargs):
-> 3325 return super(Series, self).reindex(index=index, **kwargs)
3326
3327 def drop(self, labels=None, axis=0, index=None, columns=None,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
3687 # perform the reindex on the axes
3688 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3689 fill_value, copy).__finalize__(self)
3690
3691 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3705 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
3706 fill_value=fill_value,
-> 3707 copy=copy, allow_dups=False)
3708
3709 return obj
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
3808 fill_value=fill_value,
3809 allow_dups=allow_dups,
-> 3810 copy=copy)
3811
3812 if copy and new_data is self._data:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
4412 # some axes don't allow reindexing with dups
4413 if not allow_dups:
-> 4414 self.axes[axis]._can_reindex(indexer)
4415
4416 if axis >= self.ndim:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _can_reindex(self, indexer)
3574 # trying to reindex on an axis with duplicates
3575 if not self.is_unique and len(indexer):
-> 3576 raise ValueError("cannot reindex from a duplicate axis")
3577
3578 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
However, when I replace it with an example, it works, and the only difference is the data frame name. See below. I don't see where the difference between the two lines of code is. Any ideas?
variable = ['class_cc-Harley', 'class_cc_Sport', 'class_cc_Other', 'unit_driver_experience']
unique_value = [1200, 1400, 700, 45]
p_value = [.0001, .0001, .0001, .049]
dic = {'variable': variable, 'unique_value':unique_value, 'p_value':p_value}
df = pd.DataFrame(dic)
df.loc[df['variable'].str.contains('class_cc', case = False), 'variable'] = df['variable'].str[:8]
The index of lost_cost_final_table is not unique, which can be fixed by running reset_index:
lost_cost_final_table.reset_index(inplace=True)
So I have a dataset from a genechip, where 16 chips measure 1 tissue sample. I would like to subtract from each gene in each chip the mean of this gene over all the chips. Therefore I grouped by gene and calculated the mean. Now I want to take the original PM intensity value and subtract the mean for this gene.
Thus I need to match the gene column with the index of the table where I stored the mean value for this gene group and then subtract this value from the PM column.
totalgene = genedata.groupby(genedata['GENE']).mean()[['PM','LOGPM']]
genedata['MEANNORM'] = genedata['PM'] - totalgene.ix[genedata['GENE']]['AVGPM']
genedata['MEANNORM'] = genedata['LOGPM'] - totalgene.ix[genedata['GENE']]['AVGLOGPM']
results in the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-08c1bb979f9c> in <module>()
----> 1 genedata['MEANNORM'] = genedata['PM'] - totalgene.ix[genedata['GENE'],'AVGPM']
2 genedata['MEANNORM'] = genedata['LOGPM'] - totalgene.ix[genedata['GENE'],'AVGLOGPM']
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
2417 else:
2418 # set column
-> 2419 self._set_item(key, value)
2420
2421 def _setitem_slice(self, key, value):
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
2483
2484 self._ensure_valid_index(value)
-> 2485 value = self._sanitize_column(key, value)
2486 NDFrame._set_item(self, key, value)
2487
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
2633
2634 if isinstance(value, Series):
-> 2635 value = reindexer(value)
2636
2637 elif isinstance(value, DataFrame):
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in reindexer(value)
2625 # duplicate axis
2626 if not value.index.is_unique:
-> 2627 raise e
2628
2629 # other
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in reindexer(value)
2620 # GH 4107
2621 try:
-> 2622 value = value.reindex(self.index)._values
2623 except Exception as e:
2624
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
2360 #Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
2361 def reindex(self, index=None, **kwargs):
-> 2362 return super(Series, self).reindex(index=index, **kwargs)
2363
2364 #Appender(generic._shared_docs['fillna'] % _shared_doc_kwargs)
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
2257 # perform the reindex on the axes
2258 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 2259 fill_value, copy).__finalize__(self)
2260
2261 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2275 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
2276 fill_value=fill_value,
-> 2277 copy=copy, allow_dups=False)
2278
2279 return obj
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
2369 fill_value=fill_value,
2370 allow_dups=allow_dups,
-> 2371 copy=copy)
2372
2373 if copy and new_data is self._data:
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
3837 # some axes don't allow reindexing with dups
3838 if not allow_dups:
-> 3839 self.axes[axis]._can_reindex(indexer)
3840
3841 if axis >= self.ndim:
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\indexes\base.py in _can_reindex(self, indexer)
2492 # trying to reindex on an axis with duplicates
2493 if not self.is_unique and len(indexer):
-> 2494 raise ValueError("cannot reindex from a duplicate axis")
2495
2496 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
And I have no clue why.
Could somebody help?
Consider transform for an inline aggregate which returns a series that can be subtracted from original columns, PM and LOGPM:
genedata['MEANNORM_PM'] = genedata['PM'] - \
genedata.groupby(['GENE'])['PM'].transform('mean')
genedata['MEANNORM_LOGPM'] = genedata['LOGPM'] - \
genedata.groupby(['GENE'])['LOGPM'].transform('mean')
I've seen some similar questions on here, but none that seem to be having the exact same problem as me. I'm trying to create a histogram of chemical data. The error in other instances seemed to be related to a missing column, but my data doesn't (and shouldn't) have a column named "0". Here is my code and the error message:
%pylab inline
import matplotlib.pyplot as plt
import pandas as pd
plt.figure()
#importing the data
genesis = pd.read_csv(r'C:\Connors Temp\...... (878-15G)\Task_4 (Genesis)\genesis_MWMP.csv')
arsenic = genesis[['Code','Arsenic']]
antimony = genesis[['Code','Antimony']]
plt.hist(antimony)
KeyError Traceback (most recent call last)
<ipython-input-7-c537deba42d9> in <module>()
----> 1 plt.hist(antimony)
C:\Python27\lib\site-packages\matplotlib\pyplot.pyc in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, **kwargs)
2655 histtype=histtype, align=align, orientation=orientation,
2656 rwidth=rwidth, log=log, color=color, label=label,
-> 2657 stacked=stacked, **kwargs)
2658 draw_if_interactive()
2659 finally:
C:\Python27\lib\site-packages\matplotlib\axes.pyc in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
8010 # Massage 'x' for processing.
8011 # NOTE: Be sure any changes here is also done below to 'weights'
-> 8012 if isinstance(x, np.ndarray) or not iterable(x[0]):
8013 # TODO: support masked arrays;
8014 x = np.asarray(x)
C:\Python27\lib\site-packages\pandas\core\frame.pyc in __getitem__(self, key)
1805 raise ValueError('Cannot index using non-boolean DataFrame')
1806 else:
-> 1807 return self._get_item_cache(key)
1808
1809 def _getitem_array(self, key):
C:\Python27\lib\site-packages\pandas\core\generic.pyc in _get_item_cache(self, item)
529 return cache[item]
530 except Exception:
--> 531 values = self._data.get(item)
532 res = self._box_item_values(item, values)
533 cache[item] = res
C:\Python27\lib\site-packages\pandas\core\internals.pyc in get(self, item)
828
829 def get(self, item):
--> 830 _, block = self._find_block(item)
831 return block.get(item)
832
C:\Python27\lib\site-packages\pandas\core\internals.pyc in _find_block(self, item)
942
943 def _find_block(self, item):
--> 944 self._check_have(item)
945 for i, block in enumerate(self.blocks):
946 if item in block:
C:\Python27\lib\site-packages\pandas\core\internals.pyc in _check_have(self, item)
949 def _check_have(self, item):
950 if item not in self.items:
--> 951 raise KeyError('no item named %s' % com.pprint_thing(item))
952
953 def reindex_axis(self, new_axis, method=None, axis=0, copy=True):
KeyError: u'no item named 0'
If you're using the lower-level libraries (that is, not pandas's wrappers for them), you probably should use
hist(antimony.Antimony.values)
(see the hist documentation for more).
hist takes a 1-dimensional array. Does this work?
antimony.Antimony.hist()