plt.hist errors on subsetted data - python

I'm very new to Python so please excuse any stupidity on my part.
I'm plotting a histogram with matplotlib and getting errors when I use subsetted data. The code works perfectly if I use the full dataset, hence my confusion.
Perhaps I'm not subsetting correctly?
My code is below and related errors are below, thanks.
For awareness, this was written in Python 3.
Import required packages:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Read the data:
mlb=pd.read_csv('C:\Users\ocmh\Desktop\Python\Batting.csv')
View a sample of the data:
mlb.head()
Subset the data to return just Boston data:
mlb_bos=mlb[(mlb['teamID'] == 'BOS')]
View a sample of the subset data:
mlb_bos.head()
Plot a histogram of the original data: And works perfectly
plt.hist(mlb.AB.dropna, color= sns.desaturate("indianred",1))
Plot a histogram of the subset data: Returns errors (Errors are below)
plt.hist(mlb_bos.AB.dropna, color= sns.desaturate("indianred",1))
If you don't have the seaborn package installed, you can just drop color= sns.desaturate("indianred",1) as this was purely for aesthetics.
Errors below:
KeyError Traceback (most recent call last)
<ipython-input-11-1484047d7ac6> in <module>()
----> 1 plt.hist(mlb_bos.AB, color=color)
/Users/mattoconnell/anaconda/lib/python3.4/site-packages/matplotlib/pyplot.py in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, **kwargs)
2894 histtype=histtype, align=align, orientation=orientation,
2895 rwidth=rwidth, log=log, color=color, label=label,
-> 2896 stacked=stacked, **kwargs)
2897 draw_if_interactive()
2898 finally:
/Users/mattoconnell/anaconda/lib/python3.4/site-packages/matplotlib/axes/_axes.py in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
5602 # Massage 'x' for processing.
5603 # NOTE: Be sure any changes here is also done below to 'weights'
-> 5604 if isinstance(x, np.ndarray) or not iterable(x[0]):
5605 # TODO: support masked arrays;
5606 x = np.asarray(x)
/Users/mattoconnell/anaconda/lib/python3.4/site-packages/pandas/core/series.py in __getitem__(self, key)
512 def __getitem__(self, key):
513 try:
--> 514 result = self.index.get_value(self, key)
515
516 if not np.isscalar(result):
/Users/mattoconnell/anaconda/lib/python3.4/site-packages/pandas/core/index.py in get_value(self, series, key)
1458
1459 try:
-> 1460 return self._engine.get_value(s, k)
1461 except KeyError as e1:
1462 if len(self) > 0 and self.inferred_type in ['integer','boolean']:
pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3113)()
pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:2844)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3704)()
pandas/hashtable.pyx in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:7255)()
pandas/hashtable.pyx in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:7193)()
KeyError: 0

Related

How to solve Python OSMNX error message: TypeError: unhashable type: 'dict'

My previous script using OSMnx by Geoff Boeing has stopped working since I did a Conda update. It used to run before. This is the bare part of the script that gives the error message:
import osmnx as ox
import pandas as pd
import geopandas as gpd
import networkx as nx
import numpy as np
# Set place and language; It must return a POLYGON/POLYLINE, not a POINT, so you might have to play with it a little, or set which_result below accordingly
place='ALmere, Netherlands'
# note the which_result parameter, as per comment above. Default which_result=1. For places like Utrecht changing it gives a different result
G = ox.graph_from_place(place, network_type='all', which_result=1)
# For the colouring, we take the attributes from each edge found extract the road name, and use the function above to create the colour array
edge_attributes = ox.graph_to_gdfs(G, nodes=False)
Gives error message:
TypeError: unhashable type: 'dict'
I seem to have the latest version of OSMnx (conda list shows 0.16.1). I did find this question, but can't translate it to my code: TypeError: unhashable type: 'dict' in Networkx random walk code that was previously working
And this one: https://github.com/gboeing/osmnx/issues/372. My Python version is 3.8.5
Traceback below:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1-6a868604da3b> in <module>
10
11 # note the which_result parameter, as per comment above. Default which_result=1. For places like Utrecht changing it gives a different result
---> 12 G = ox.graph_from_place(place, network_type='all', which_result=1)
13
14 # For the colouring, we take the attributes from each edge found extract the road name, and use the function above to create the colour array
~\Anaconda3\lib\site-packages\osmnx\core.py in graph_from_place(query, network_type, simplify, retain_all, truncate_by_edge, name, which_result, buffer_dist, timeout, memory, max_query_area_size, clean_periphery, infrastructure, custom_filter)
1443 max_query_area_size=max_query_area_size,
1444 clean_periphery=clean_periphery, infrastructure=infrastructure,
-> 1445 custom_filter=custom_filter)
1446
1447 log('graph_from_place() returning graph with {:,} nodes and {:,} edges'.format(len(list(G.nodes())), len(list(G.edges()))))
~\Anaconda3\lib\site-packages\osmnx\core.py in graph_from_polygon(polygon, network_type, simplify, retain_all, truncate_by_edge, name, timeout, memory, max_query_area_size, clean_periphery, infrastructure, custom_filter)
1319 G_buffered = create_graph(response_jsons, name=name, retain_all=True,
1320 bidirectional=network_type in settings.bidirectional_network_types)
-> 1321 G_buffered = truncate_graph_polygon(G_buffered, polygon_buffered, retain_all=True, truncate_by_edge=truncate_by_edge)
1322
1323 # simplify the graph topology
~\Anaconda3\lib\site-packages\osmnx\core.py in truncate_graph_polygon(G, polygon, retain_all, truncate_by_edge, quadrat_width, min_num, buffer_amount)
731
732 # find all the nodes in the graph that lie outside the polygon
--> 733 points_within_geometry = intersect_index_quadrats(gdf_nodes, polygon, quadrat_width=quadrat_width, min_num=min_num, buffer_amount=buffer_amount)
734 nodes_outside_polygon = gdf_nodes[~gdf_nodes.index.isin(points_within_geometry.index)]
735
~\Anaconda3\lib\site-packages\osmnx\core.py in intersect_index_quadrats(gdf, geometry, quadrat_width, min_num, buffer_amount)
678 # drop duplicate points, if buffered poly caused an overlap on point(s)
679 # that lay directly on a quadrat line
--> 680 points_within_geometry = points_within_geometry.drop_duplicates(subset='node')
681 else:
682 # after simplifying the graph, and given the requested network type,
~\Anaconda3\lib\site-packages\pandas\core\frame.py in drop_duplicates(self, subset, keep, inplace, ignore_index)
5106
5107 inplace = validate_bool_kwarg(inplace, "inplace")
-> 5108 duplicated = self.duplicated(subset, keep=keep)
5109
5110 result = self[-duplicated]
~\Anaconda3\lib\site-packages\pandas\core\frame.py in duplicated(self, subset, keep)
5245
5246 vals = (col.values for name, col in self.items() if name in subset)
-> 5247 labels, shape = map(list, zip(*map(f, vals)))
5248
5249 ids = get_group_index(labels, shape, sort=False, xnull=False)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in f(vals)
5220 def f(vals):
5221 labels, shape = algorithms.factorize(
-> 5222 vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)
5223 )
5224 return labels.astype("i8", copy=False), len(shape)
~\Anaconda3\lib\site-packages\pandas\core\algorithms.py in factorize(values, sort, na_sentinel, size_hint)
676
677 codes, uniques = _factorize_array(
--> 678 values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
679 )
680
~\Anaconda3\lib\site-packages\pandas\core\algorithms.py in _factorize_array(values, na_sentinel, size_hint, na_value, mask)
499 table = hash_klass(size_hint or len(values))
500 uniques, codes = table.factorize(
--> 501 values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
502 )
503
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.factorize()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable._unique()
TypeError: unhashable type: 'dict'

A squared variable is outside the index

(A variation of this post, without the detailed traceback, was posted on SO about two hours ago. This version contains the whole traceback.)
I am running StatsModels to get parameter estimates from ordinary least squares (OLS). Data-processing and model-specific commands are shown below. When I use import statsmodels.formula.api as sm as the operative API, the OLS works as desired (after I drop some 15 rows programmatically), giving intuitive results. But when I switch to import statsmodels.api as sm as the binding API, changing almost nothing else in the code, things fall apart, and the Python interpreter raises an error saying that 'inc_2 is not in the index'. Mind you, inc_2 was computed after the dataframe was read in, in both model runs; and yet the run was successful in the first case, but not in the second. (BTW, p_c_inc_18 is per-capita income, and inc_2 is the former squared. inc_2 is the offending element in the second run.)
import pandas as pd
import numpy as np
import statsmodels.api as sm
%matplotlib inline import
matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid") eg=pd.read_csv(r'C:/../../../une_edu_pipc_06.csv') pd.options.display.precision = 3
plt.rc("figure", figsize=(16,8))
plt.rc("font", size=14)
sm_col = eg["lt_hsd_17"] + eg["hsd_17"]
eg["ut_hsd_17"] = sm_col
sm_col2 = eg["sm_col_17"] + eg["col_17"] eg["bnd_hsd_17"] = sm_col2
eg["d_09"]= eg["Rate_09"]-eg["Rate_06"]
eg["d_10"]= eg["Rate_10"]-eg["Rate_06"] inc_2=eg["p_c_inc_18"]*eg["p_c_inc_18"]
X = eg[["p_c_inc_18","ut_hsd_17","d_10","inc_2"]]
y = eg["Rate_18"]
X = sm.add_constant(X)
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())
Here is the traceback in full.
KeyError Traceback (most recent call last)
<ipython-input-21-e2f4d325145e> in <module>
17 eg["d_10"]= eg["Rate_10"]-eg["Rate_06"]
18 inc_2=eg["p_c_inc_18"]*eg["p_c_inc_18"]
---> 19 X = eg[["p_c_inc_18","ut_hsd_17","d_10","inc_2"]]
20 y = eg["Rate_18"]
21 X = sm.add_constant(X)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2804 if is_iterator(key):
2805 key = list(key)
-> 2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2807
2808 # take() does not accept boolean indexers
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1550 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1551
-> 1552 self._validate_read_indexer(
1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1644 if not (self.name == "loc" and not raise_missing):
1645 not_found = list(set(key) - set(ax))
-> 1646 raise KeyError(f"{not_found} not in index")
1647
1648 # we skip the warning on Categorical/Interval
KeyError: "['inc_2'] not in index"
What am I doing wrong?
The syntax you used insists that a list of strings is a legal index into eg. If you print(eg), you'll see that it has no such element. I think what you meant was to make a list of elements, each indexed by a single string.
X = [
eg["p_c_inc_18"],
eg["ut_hsd_17"],
eg["d_10"],
eg["inc_2"]
]

Unable to plot linear regression scatter plot and predicting line

I have tried a lot of methods and gone through many questions that are already answered here, but in vain. None of them could solve my problem. So please, if you can't solve this problem, at least don't mark it as 'duplicate', because I am desperately trying to make my model work and am stuck on this stupid syntactical error.
My query is pretty simple: I have a data frame that consists of 2 columns; the first is 'sqft_living' and the second is 'price'. I have used linear regression to predict price based on the sqft area. I want to visualize this as a scatter plot followed by a best-fit line. However, I am getting this error:
TypeError: unhashable type: 'numpy.ndarray'
I have already converted the dataframe into a series and corrected the dimensions as well, but I am still getting this error.
Please provide me with solution code along with an explanation.
Any help will be highly appreciated as I am stuck with this and can't complete my assignment.
Below is the exact code and the error I am getting.
import numpy as np
import matplotlib.pyplot as plt # To visualize
import pandas as pd # To read data
from sklearn.linear_model import LinearRegression
X = poly1_data.iloc[:, 0].values.reshape(-1, 1) # values converts it into a numpy array
Y = poly1_data.iloc[:, 1].values.reshape(-1, 1) # -1 means that calculate the dimension of rows, but have 1 column
linear_regressor = LinearRegression() # create object for the class
linear_regressor.fit(X, Y) # perform linear regression
Y_pred = linear_regressor.predict(X) # make predictions
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='red')
plt.show()
--------------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-245-96227c9411b1> in <module>
9 linear_regressor.fit(X, Y) # perform linear regression
10 Y_pred = linear_regressor.predict(X) # make predictions
---> 11 plt.scatter(X, Y)
12 plt.plot(X, Y_pred, color='red')
13 plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, data, **kwargs)
2860 vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths,
2861 verts=verts, edgecolors=edgecolors, **({"data": data} if data
-> 2862 is not None else {}), **kwargs)
2863 sci(__ret)
2864 return __ret
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, data, *args, **kwargs)
1808 "the Matplotlib list!)" % (label_namer, func.__name__),
1809 RuntimeWarning, stacklevel=2)
-> 1810 return func(ax, *args, **kwargs)
1811
1812 inner.__doc__ = _add_data_doc(inner.__doc__,
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, **kwargs)
4170 edgecolors = 'face'
4171
-> 4172 self._process_unit_info(xdata=x, ydata=y, kwargs=kwargs)
4173 x = self.convert_xunits(x)
4174 y = self.convert_yunits(y)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _process_unit_info(self, xdata, ydata, kwargs)
2133 return kwargs
2134
-> 2135 kwargs = _process_single_axis(xdata, self.xaxis, 'xunits', kwargs)
2136 kwargs = _process_single_axis(ydata, self.yaxis, 'yunits', kwargs)
2137 return kwargs
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _process_single_axis(data, axis, unit_name, kwargs)
2116 # We only need to update if there is nothing set yet.
2117 if not axis.have_units():
-> 2118 axis.update_units(data)
2119
2120 # Check for units in the kwargs, and if present update axis
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axis.py in update_units(self, data)
1471 neednew = self.converter != converter
1472 self.converter = converter
-> 1473 default = self.converter.default_units(data, self)
1474 if default is not None and self.units is None:
1475 self.set_units(default)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in default_units(data, axis)
101 # default_units->axis_info->convert
102 if axis.units is None:
--> 103 axis.set_units(UnitData(data))
104 else:
105 axis.units.update(data)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in __init__(self, data)
167 self._counter = itertools.count()
168 if data is not None:
--> 169 self.update(data)
170
171 def update(self, data):
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in update(self, data)
184 data = np.atleast_1d(np.array(data, dtype=object))
185
--> 186 for val in OrderedDict.fromkeys(data):
187 if not isinstance(val, (str, bytes)):
188 raise TypeError("{val!r} is not a string".format(val=val))
TypeError: unhashable type: 'numpy.ndarray'
[I am getting this image without any scatter plot and best fit line][1]
[1]: https://i.stack.imgur.com/9jccu.png

numpy.histogram fails after updating anaconda

I have been using the matplotlib function plt.hist to generate histogram data from an array of values mV. This has worked fine in the past, but ever since I've updated my version of anaconda it throws back a ValueError:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/users/benjatin/HomeData/Code/buildSkyMap.py in <module>()
72 #get histogram of efficiencies
73 plt.figure()
---> 74 a=plt.hist(mV,bins=50)
75 plt.close()
76
/opt/apps/lsst/feb14/Linux64/anaconda/2.1.0/lib/python2.7/site-packages/matplotlib/pyplot.pyc in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, **kwargs)
2888 histtype=histtype, align=align, orientation=orientation,
2889 rwidth=rwidth, log=log, color=color, label=label,
-> 2890 stacked=stacked, **kwargs)
2891 draw_if_interactive()
2892 finally:
/opt/apps/lsst/feb14/Linux64/anaconda/2.1.0/lib/python2.7/site-packages/matplotlib/axes/_axes.pyc in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
5634 # this will automatically overwrite bins,
5635 # so that each histogram uses the same bins
-> 5636 m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
5637 m = m.astype(float) # causes problems later if it's an int
5638 if mlast is None:
/opt/apps/lsst/feb14/Linux64/anaconda/2.1.0/lib/python2.7/site-packages/numpy/lib/function_base.pyc in histogram(a, bins, range, normed, weights, density)
598 n.imag += np.bincount(indices, weights=tmp_w.imag, minlength=bins)
599 else:
--> 600 n += np.bincount(indices, weights=tmp_w, minlength=bins).astype(ntype)
601
602 # We now compute the bin edges since these are returned
ValueError: The first argument of bincount must be non-negative
None of the values in mV are negative, as was the problem here:
In [34]: mV[mV < 0]
Out[34]: array([], dtype=float64)
The update I did was:
conda: 3.7.0-py27_0 --> 4.0.5-py27_0 (soft-link)
openssl: 1.0.1h-1 --> 1.0.2h-0 (soft-link)
python: 2.7.8-1 --> 2.7.11-0 (soft-link)
pyyaml: 3.11-py27_0 --> 3.11-py27_1 (soft-link)
requests: 2.4.1-py27_0 --> 2.9.1-py27_0 (soft-link)
sqlite: 3.8.4.1-0 --> 3.9.2-0 (soft-link)
tk: 8.5.15-0 --> 8.5.18-0 (soft-link)
yaml: 0.1.4-0 --> 0.1.6-0 (soft-link)
zlib: 1.2.7-0 --> 1.2.8-0 (soft-link)
Thanks in advance for any help solving this issue.
Filter out any nan and inf from your data before plotting the histogram. See the bug report here.

silhouette coefficient in python with sklearn

I'm having trouble computing the silhouette coefficient in python with sklearn.
Here is my code :
from sklearn import datasets
from sklearn.metrics import *
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns = col)
y = pd.DataFrame(iris.target,columns = ['cluster'])
s = silhouette_score(X, y, metric='euclidean',sample_size=int(50))
I get the error :
IndexError: indices are out-of-bounds
I want to use the sample_size parameter because, when working with very large datasets, the silhouette takes too long to compute. Does anyone know how this parameter works?
Complete traceback :
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-72-70ff40842503> in <module>()
4 X = pd.DataFrame(iris.data, columns = col)
5 y = pd.DataFrame(iris.target,columns = ['cluster'])
----> 6 s = silhouette_score(X, y, metric='euclidean',sample_size=50)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/cluster/unsupervised.pyc in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
81 X, labels = X[indices].T[indices].T, labels[indices]
82 else:
---> 83 X, labels = X[indices], labels[indices]
84 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
85
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
1993 if isinstance(key, (np.ndarray, list)):
1994 # either boolean or fancy integer index
-> 1995 return self._getitem_array(key)
1996 elif isinstance(key, DataFrame):
1997 return self._getitem_frame(key)
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _getitem_array(self, key)
2030 else:
2031 indexer = self.ix._convert_to_indexer(key, axis=1)
-> 2032 return self.take(indexer, axis=1, convert=True)
2033
2034 def _getitem_multilevel(self, key):
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in take(self, indices, axis, convert)
2981 if convert:
2982 axis = self._get_axis_number(axis)
-> 2983 indices = _maybe_convert_indices(indices, len(self._get_axis(axis)))
2984
2985 if self._is_mixed_type:
/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _maybe_convert_indices(indices, n)
1038 mask = (indices>=n) | (indices<0)
1039 if mask.any():
-> 1040 raise IndexError("indices are out-of-bounds")
1041 return indices
1042
IndexError: indices are out-of-bounds
silhouette_score expects regular numpy arrays as input. Why wrap your arrays in data frames?
>>> silhouette_score(iris.data, iris.target, sample_size=50)
0.52999903616584543
From the traceback, you can observe that the code is doing fancy indexing (subsampling) on the first axis. By default, indexing a dataframe will index the columns and not the rows, hence the issue you observe.

Categories