Facet Grid not working for categorical variables - python

I get an error when trying to create a Facet Grid with Seaborn. I have 3 categorical variables: Gender, Day of the Week, Color. I want to understand the direct correlation of all values within each category to each other.
Gender: Female, Male
Day of the Week: Mo,Tue,Wed,Thu,Fri,Sat,Sun
Color:Red, Green.
g = sns.FacetGrid(tips, col="Gender", row="Color")
g = g.map(plt.hist, "Day of the Week")
display()
Get an error:
KeyError-Traceback (most recent call last)
<command-206114> in <module>()
2 tips = sns.load_dataset("tips")
3
----> 4 g = sns.FacetGrid(tips, col="Gender", row="Color")
5 g = g.map(plt.hist, "Day of the week")
6 display()
/databricks/python/lib/python3.5/site-packages/seaborn/axisgrid.py in __init__(self, data, row, col, hue, col_wrap, sharex, sharey, size, aspect, palette, row_order, col_order, hue_order, hue_kws, dropna, legend_out, despine, margin_titles, xlim, ylim, subplot_kws, gridspec_kws)
240 row_names = []
241 else:
--> 242 row_names = utils.categorical_order(data[row], row_order)
243
244 if col is None:
/databricks/python/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
/databricks/python/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
/databricks/python/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1384 res = cache.get(item)
1385 if res is None:
-> 1386 values = self._data.get(item)
1387 res = self._box_item_values(item, values)
1388 cache[item] = res
/databricks/python/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3541
3542 if not isnull(item):
-> 3543 loc = self.items.get_loc(item)
3544 else:
3545 indexer = np.arange(len(self.items))[isnull(self.items)]
/databricks/python/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2134 return self._engine.get_loc(key)
2135 except KeyError:
-> 2136 return self._engine.get_loc(self._maybe_cast_indexer(key))
2137
2138 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
KeyError: 'Color'
Does anybody know why this is?

The variable "Color" is the issue here: the KeyError means the DataFrame has no column with exactly that name. It could be misspelled. Also, if you are plotting two categorical variables, try using a bar chart.

Related

Pandas read_csv - non-printable character (columns not recognized)

Could someone tell me what non-printable character I have in my code that makes Python not recognize the column names in my dataframe?
import pandas as pd
data_olymp = pd.read_csv("Olympics_data.csv", sep=";")
Here is the Traceback of the error when I try to group by teamname :
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-103-ae95f10f5210> in <module>
30 # print(type(réponse1))
31 # print(len(winter_games_bronze_won))
---> 32 print(data_olymp.loc[" winter_games_bronze_won"] == 9)
~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
893
894 maybe_callable = com.apply_if_callable(key, self.obj)
--> 895 return self._getitem_axis(maybe_callable, axis=axis)
896
897 def _is_scalar_access(self, key: Tuple):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1122 # fall thru to straight lookup
1123 self._validate_key(key, axis)
-> 1124 return self._get_label(key, axis=axis)
1125
1126 def _get_slice_axis(self, slice_obj: slice, axis: int):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
1071 def _get_label(self, label, axis: int):
1072 # GH#5667 this will fail if the label is not present in the axis.
-> 1073 return self.obj.xs(label, axis=axis)
1074
1075 def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):
~\anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3737 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
3738 else:
-> 3739 loc = index.get_loc(key)
3740
3741 if isinstance(loc, np.ndarray):
~\anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
352 except ValueError as err:
353 raise KeyError(key) from err
--> 354 raise KeyError(key)
355 return super().get_loc(key, method=method, tolerance=tolerance)
356
KeyError: ' winter_games_bronze_won'
The file looks like this:
team_name; summer_games_played; summer_games_gold_won; summer_games_silver_won; summer_games_bronze_won; summer_games_medals_won; winter_games_played; winter_games_gold_won; winter_games_silver_won; winter_games_bronze_won; winter_games_medals_won; total_games_played
Canada (CAN);13;0;0;2;2;0;0;0;0;0;13
United States (USA);12;5;2;8;15;3;0;0;0;0;15
Russia (RUS);23;18;24;28;70;18;0;0;0;0;41
Key errors are raised when you try to access a key that is not in a dictionary. When working with pandas, it is much the same thing: .loc is trying to locate a key that is not found in the data frame.
Looking at your code and the traceback, my assumption is that you are getting the error because you are trying to look up winter_games_bronze_won with the spaces at the beginning. Try removing the spaces before winter_games_bronze_won and see what happens.

Pandas sum up to a specific column value

This little piece of code is giving me more trouble than it should. I would appreciate any help. Thank you in advance for taking a look at this for me.
I am trying to sum up the price and the volume from the previous 22 data points, for which I added a column named dtt that holds this value. The formula that I am trying to represent here is:
$$\mathrm{vwap} = \frac{\sum_{i=1}^{\mathrm{dtt}} \mathrm{price}_i \cdot \mathrm{volume}_i}{\sum_{i=1}^{\mathrm{dtt}} \mathrm{volume}_i}$$
Here is my code
# Import necessary libraries
import numpy as np
import pandas as pd
import os
# Load SPY dataset
spy_data = pd.read_csv('SPY.csv', parse_dates=['Date'])
# Compute Daily Return
spy_data['daily_ret'] = (spy_data['Adj Close'] - (spy_data['Adj Close']).shift(1)) / ((spy_data['Adj Close']).shift(1)) * 100
spy_data['daily_ret'] = spy_data['daily_ret'].fillna(0.0)
# calculate Annualized Volatility
rsquare = (spy_data['daily_ret']) ** 2
spy_data['annualized_volatility']=(np.sqrt(rsquare.rolling(252).sum() / 251) * np.sqrt(252))
spy_data['annualized_volatility'] = spy_data['annualized_volatility'].fillna(0)
spy_shares = 889112600
# Calculate Days to Trade
spy_data['dtt'] = spy_shares / (spy_data['Volume'].rolling(22).sum()/22)
spy_data['dtt'] = spy_data['dtt'].fillna(1).astype(int)
# Calculate VWAP
#numerator is equal to the sum of the price * volume of the latest DTT
numerator = spy_data.loc[0:spy_data['dtt'], 'Adj Close'].sum()#*spy_data.loc[0:spy_data['dtt'], 'Volume']
#denominator = spy_data.loc[0:spy_data['dtt'], 'Volume']
#spy_data['vwap'] = numerator / denominator
print(spy_data)
I commented out the other lines because I was trying to problem solve it one step at a time. The price column that I need is the Adj Close.
No matter what I try in terms of slicing I keep getting an error. This is the current one:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-68-ade975138e12> in <module>
7 # Calculate VWAP
8 #numerator is equal to the sum of the price * volume of the latest DTT
----> 9 numerator = spy_data.loc[0:spy_data['dtt']].sum()#*spy_data[0:spy_data['dtt'], 'volume']
10 #denominator = spy_data.loc[0:spy_data['dtt'], 'Volume']
11 #spy_data['vwap'] = numerator / denominator
~/.local/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
1408
1409 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1410 return self._getitem_axis(maybe_callable, axis=axis)
1411
1412 def _is_scalar_access(self, key: Tuple):
~/.local/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
1770 if isinstance(key, slice):
1771 self._validate_key(key, axis)
-> 1772 return self._get_slice_axis(key, axis=axis)
1773 elif com.is_bool_indexer(key):
1774 return self._getbool_axis(key, axis=axis)
~/.local/lib/python3.6/site-packages/pandas/core/indexing.py in _get_slice_axis(self, slice_obj, axis)
1438 labels = obj._get_axis(axis)
1439 indexer = labels.slice_indexer(
-> 1440 slice_obj.start, slice_obj.stop, slice_obj.step, kind=self.name
1441 )
1442
~/.local/lib/python3.6/site-packages/pandas/core/indexes/base.py in slice_indexer(self, start, end, step, kind)
5025 slice(1, 3)
5026 """
-> 5027 start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
5028
5029 # return a slice
~/.local/lib/python3.6/site-packages/pandas/core/indexes/base.py in slice_locs(self, start, end, step, kind)
5245 end_slice = None
5246 if end is not None:
-> 5247 end_slice = self.get_slice_bound(end, "right", kind)
5248 if end_slice is None:
5249 end_slice = len(self)
~/.local/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
5155 # we need to look up the label
5156 try:
-> 5157 slc = self.get_loc(label)
5158 except KeyError as err:
5159 try:
~/.local/lib/python3.6/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
377 except ValueError:
378 raise KeyError(key)
--> 379 return super().get_loc(key, method=method, tolerance=tolerance)
380
381 #Appender(_index_shared_docs["get_indexer"])
~/.local/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2888 )
2889 try:
-> 2890 return self._engine.get_loc(key)
2891 except KeyError:
2892 return self._engine.get_loc(self._maybe_cast_indexer(key))
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
TypeError: '0 1
1 1
2 1
3 1
4 1
..
4694 9
4695 9
4696 10
4697 10
4698 11
Name: dtt, Length: 4699, dtype: int64' is an invalid key

Replace a string with a shorter version of itself using pandas

I have a pandas dataframe with one column of model variables and their corresponding statistics in another column. I've done some string manipulation to get a derived summary table to join the summary table from the model.
lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
Full traceback.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-229-1dbe5bd14d4b> in <module>
----> 1 lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
2 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_v_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
3 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('married_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
4 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('state_model', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
5
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
187 key = com._apply_if_callable(key, self.obj)
188 indexer = self._get_setitem_indexer(key)
--> 189 self._setitem_with_indexer(indexer, value)
190
191 def _validate_key(self, key, axis):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
467
468 if isinstance(value, ABCSeries):
--> 469 value = self._align_series(indexer, value)
470
471 info_idx = indexer[info_axis]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
732 return ser._values.copy()
733
--> 734 return ser.reindex(new_ix)._values
735
736 # 2 dims
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
3323 #Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
3324 def reindex(self, index=None, **kwargs):
-> 3325 return super(Series, self).reindex(index=index, **kwargs)
3326
3327 def drop(self, labels=None, axis=0, index=None, columns=None,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
3687 # perform the reindex on the axes
3688 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3689 fill_value, copy).__finalize__(self)
3690
3691 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3705 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
3706 fill_value=fill_value,
-> 3707 copy=copy, allow_dups=False)
3708
3709 return obj
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
3808 fill_value=fill_value,
3809 allow_dups=allow_dups,
-> 3810 copy=copy)
3811
3812 if copy and new_data is self._data:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
4412 # some axes don't allow reindexing with dups
4413 if not allow_dups:
-> 4414 self.axes[axis]._can_reindex(indexer)
4415
4416 if axis >= self.ndim:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _can_reindex(self, indexer)
3574 # trying to reindex on an axis with duplicates
3575 if not self.is_unique and len(indexer):
-> 3576 raise ValueError("cannot reindex from a duplicate axis")
3577
3578 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
However, when I replace it with an example, it works, and the only difference is the data frame name. See below. I don't see what the difference between the two lines of code is. Any ideas?
variable = ['class_cc-Harley', 'class_cc_Sport', 'class_cc_Other', 'unit_driver_experience']
unique_value = [1200, 1400, 700, 45]
p_value = [.0001, .0001, .0001, .049]
dic = {'variable': variable, 'unique_value':unique_value, 'p_value':p_value}
df = pd.DataFrame(dic)
df.loc[df['variable'].str.contains('class_cc', case = False), 'variable'] = df['variable'].str[:8]
The index of lost_cost_final_table is not unique, which can be fixed by running reset_index:
lost_cost_final_table.reset_index(inplace=True)

Pandas subtracting group mean from column value

So I have a dataset from a genechip, where 16 chips measure 1 tissue sample. I would like to subtract from each gene in each chip the mean of this gene over all the chips. Therefore I grouped by gene and calculated the mean. Now I want to take the original PM intensity value and subtract the mean of this gene from it.
Thus I need to match the gene column with the index of the table where I stored the mean value for this gene group, and then subtract this value from the PM column.
totalgene = genedata.groupby(genedata['GENE']).mean()[['PM','LOGPM']]
genedata['MEANNORM'] = genedata['PM'] - totalgene.ix[genedata['GENE']]['AVGPM']
genedata['MEANNORM'] = genedata['LOGPM'] - totalgene.ix[genedata['GENE']]['AVGLOGPM']
results in the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-08c1bb979f9c> in <module>()
----> 1 genedata['MEANNORM'] = genedata['PM'] - totalgene.ix[genedata['GENE'],'AVGPM']
2 genedata['MEANNORM'] = genedata['LOGPM'] - totalgene.ix[genedata['GENE'],'AVGLOGPM']
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
2417 else:
2418 # set column
-> 2419 self._set_item(key, value)
2420
2421 def _setitem_slice(self, key, value):
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
2483
2484 self._ensure_valid_index(value)
-> 2485 value = self._sanitize_column(key, value)
2486 NDFrame._set_item(self, key, value)
2487
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
2633
2634 if isinstance(value, Series):
-> 2635 value = reindexer(value)
2636
2637 elif isinstance(value, DataFrame):
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in reindexer(value)
2625 # duplicate axis
2626 if not value.index.is_unique:
-> 2627 raise e
2628
2629 # other
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in reindexer(value)
2620 # GH 4107
2621 try:
-> 2622 value = value.reindex(self.index)._values
2623 except Exception as e:
2624
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
2360 #Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
2361 def reindex(self, index=None, **kwargs):
-> 2362 return super(Series, self).reindex(index=index, **kwargs)
2363
2364 #Appender(generic._shared_docs['fillna'] % _shared_doc_kwargs)
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
2257 # perform the reindex on the axes
2258 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 2259 fill_value, copy).__finalize__(self)
2260
2261 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2275 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
2276 fill_value=fill_value,
-> 2277 copy=copy, allow_dups=False)
2278
2279 return obj
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
2369 fill_value=fill_value,
2370 allow_dups=allow_dups,
-> 2371 copy=copy)
2372
2373 if copy and new_data is self._data:
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
3837 # some axes don't allow reindexing with dups
3838 if not allow_dups:
-> 3839 self.axes[axis]._can_reindex(indexer)
3840
3841 if axis >= self.ndim:
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\indexes\base.py in _can_reindex(self, indexer)
2492 # trying to reindex on an axis with duplicates
2493 if not self.is_unique and len(indexer):
-> 2494 raise ValueError("cannot reindex from a duplicate axis")
2495
2496 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
And I have no clue why.
Could somebody help?
Consider transform for an inline aggregate which returns a series that can be subtracted from original columns, PM and LOGPM:
genedata['MEANNORM_PM'] = genedata['PM'] - \
genedata.groupby(['GENE'])['PM'].transform('mean')
genedata['MEANNORM_LOGPM'] = genedata['LOGPM'] - \
genedata.groupby(['GENE'])['LOGPM'].transform('mean')

KeyError: u'no item named 0' comes up with histogram

I've seen some similar questions on here, but none that seem to be having the exact same problem as me. I'm trying to create a histogram of chemical data. The error in other instances seemed to be related to a missing column, but my data doesn't (and shouldn't) have a column named "0". Here is my code and the error message:
%pylab inline
import matplotlib.pyplot as plt
import pandas as pd
plt.figure()
#importing the data
genesis = pd.read_csv(r'C:\Connors Temp\...... (878-15G)\Task_4 (Genesis)\genesis_MWMP.csv')
arsenic = genesis[['Code','Arsenic']]
antimony = genesis[['Code','Antimony']]
plt.hist(antimony)
KeyError Traceback (most recent call last)
<ipython-input-7-c537deba42d9> in <module>()
----> 1 plt.hist(antimony)
C:\Python27\lib\site-packages\matplotlib\pyplot.pyc in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, **kwargs)
2655 histtype=histtype, align=align, orientation=orientation,
2656 rwidth=rwidth, log=log, color=color, label=label,
-> 2657 stacked=stacked, **kwargs)
2658 draw_if_interactive()
2659 finally:
C:\Python27\lib\site-packages\matplotlib\axes.pyc in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
8010 # Massage 'x' for processing.
8011 # NOTE: Be sure any changes here is also done below to 'weights'
-> 8012 if isinstance(x, np.ndarray) or not iterable(x[0]):
8013 # TODO: support masked arrays;
8014 x = np.asarray(x)
C:\Python27\lib\site-packages\pandas\core\frame.pyc in __getitem__(self, key)
1805 raise ValueError('Cannot index using non-boolean DataFrame')
1806 else:
-> 1807 return self._get_item_cache(key)
1808
1809 def _getitem_array(self, key):
C:\Python27\lib\site-packages\pandas\core\generic.pyc in _get_item_cache(self, item)
529 return cache[item]
530 except Exception:
--> 531 values = self._data.get(item)
532 res = self._box_item_values(item, values)
533 cache[item] = res
C:\Python27\lib\site-packages\pandas\core\internals.pyc in get(self, item)
828
829 def get(self, item):
--> 830 _, block = self._find_block(item)
831 return block.get(item)
832
C:\Python27\lib\site-packages\pandas\core\internals.pyc in _find_block(self, item)
942
943 def _find_block(self, item):
--> 944 self._check_have(item)
945 for i, block in enumerate(self.blocks):
946 if item in block:
C:\Python27\lib\site-packages\pandas\core\internals.pyc in _check_have(self, item)
949 def _check_have(self, item):
950 if item not in self.items:
--> 951 raise KeyError('no item named %s' % com.pprint_thing(item))
952
953 def reindex_axis(self, new_axis, method=None, axis=0, copy=True):
KeyError: u'no item named 0'
If you're using the lower-level libraries (that is, not pandas's wrappers for them), you probably should use
hist(antimony.Antimony.values)
(see the hist documentation for more).
hist takes a 1-dimensional array. Does this work?
antimony.Antimony.hist()

Categories