I have tried a lot of methods and gone through many questions that are already answered here, but in vain: none of them solved my problem. So please, if you can't solve this problem, at least don't mark it as 'duplicate', because I am desperately trying to make my model work and am stuck on this stupid error.
My query is pretty simple: I have a data frame that consists of 2 columns; the first is 'sqft_living' and the second is 'price'. I have used linear regression to predict the price based on the sqft area. I want to visualize this as a scatter plot followed by a best-fit line. However, I am getting this error:
TypeError: unhashable type: 'numpy.ndarray'
I have already converted the dataframe into series and corrected the dimensions as well, but I am still getting this error.
Please provide me with solution code along with an explanation.
Any help will be highly appreciated as I am stuck with this and can't complete my assignment.
Below is the exact code and the error I am getting.
import numpy as np
import matplotlib.pyplot as plt # To visualize
import pandas as pd # To read data
from sklearn.linear_model import LinearRegression
X = poly1_data.iloc[:, 0].values.reshape(-1, 1) # values converts it into a numpy array
Y = poly1_data.iloc[:, 1].values.reshape(-1, 1) # -1 means that calculate the dimension of rows, but have 1 column
linear_regressor = LinearRegression() # create object for the class
linear_regressor.fit(X, Y) # perform linear regression
Y_pred = linear_regressor.predict(X) # make predictions
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='red')
plt.show()
--------------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-245-96227c9411b1> in <module>
9 linear_regressor.fit(X, Y) # perform linear regression
10 Y_pred = linear_regressor.predict(X) # make predictions
---> 11 plt.scatter(X, Y)
12 plt.plot(X, Y_pred, color='red')
13 plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, data, **kwargs)
2860 vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths,
2861 verts=verts, edgecolors=edgecolors, **({"data": data} if data
-> 2862 is not None else {}), **kwargs)
2863 sci(__ret)
2864 return __ret
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, data, *args, **kwargs)
1808 "the Matplotlib list!)" % (label_namer, func.__name__),
1809 RuntimeWarning, stacklevel=2)
-> 1810 return func(ax, *args, **kwargs)
1811
1812 inner.__doc__ = _add_data_doc(inner.__doc__,
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, **kwargs)
4170 edgecolors = 'face'
4171
-> 4172 self._process_unit_info(xdata=x, ydata=y, kwargs=kwargs)
4173 x = self.convert_xunits(x)
4174 y = self.convert_yunits(y)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _process_unit_info(self, xdata, ydata, kwargs)
2133 return kwargs
2134
-> 2135 kwargs = _process_single_axis(xdata, self.xaxis, 'xunits', kwargs)
2136 kwargs = _process_single_axis(ydata, self.yaxis, 'yunits', kwargs)
2137 return kwargs
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _process_single_axis(data, axis, unit_name, kwargs)
2116 # We only need to update if there is nothing set yet.
2117 if not axis.have_units():
-> 2118 axis.update_units(data)
2119
2120 # Check for units in the kwargs, and if present update axis
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axis.py in update_units(self, data)
1471 neednew = self.converter != converter
1472 self.converter = converter
-> 1473 default = self.converter.default_units(data, self)
1474 if default is not None and self.units is None:
1475 self.set_units(default)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in default_units(data, axis)
101 # default_units->axis_info->convert
102 if axis.units is None:
--> 103 axis.set_units(UnitData(data))
104 else:
105 axis.units.update(data)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in __init__(self, data)
167 self._counter = itertools.count()
168 if data is not None:
--> 169 self.update(data)
170
171 def update(self, data):
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in update(self, data)
184 data = np.atleast_1d(np.array(data, dtype=object))
185
--> 186 for val in OrderedDict.fromkeys(data):
187 if not isinstance(val, (str, bytes)):
188 raise TypeError("{val!r} is not a string".format(val=val))
TypeError: unhashable type: 'numpy.ndarray'
[I am getting this image without any scatter plot and best fit line][1]
[1]: https://i.stack.imgur.com/9jccu.png
Related
I am trying to plot my predicted values against my independent variable
If they had compatible shapes to be fitted by the model, why would they not be the same size for plotting? Yes, they are not the same, since X has multiple attributes while y is a single dependent variable, but I still do not understand how to plot them.
Code:
# generate random data-set
np.random.seed(0)
x = df1
y = subjects["heart_rate"]
#x= x.values.reshape(-1, 1)
#y= y.values.reshape(-1, 1)
#x.shape[0] != y.shape[0]
# sckit-learn implementation
# Model initialization
regression_model = LinearRegression()
# Fit the data(train the model)
regression_model.fit(x, y)
# Predict
y_predicted = regression_model.predict(x)
# model evaluation
rmse = mean_squared_error(y, y_predicted)
r2 = r2_score(y, y_predicted)
# printing values
print('Slope:' ,regression_model.coef_)
print('Intercept:', regression_model.intercept_)
print('Root mean squared error: ', rmse)
print('R2 score: ', r2)
# plotting values
# data points
plt.scatter(x, y, s=10)
plt.xlabel('x')
plt.ylabel('y')
# predicted values
plt.plot(x, y_predicted, color='r')
plt.show()
Error:
Slope: [-5.37064533 -0.50880666 -0.09348247 -1.33726289 0.18639004 1.42313131
-0.2752906 0.26580939 -0.17365683 -0.13841734]
Intercept: 274.5378817549546
Root mean squared error: 498.2107065101733
R2 score: 0.31625521784258237
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-179-a4bdd40003d3> in <module>()
31 # data points
32
---> 33 plt.scatter(x, y, s=10)
34 plt.xlabel('x')
35 plt.ylabel('y')
c:\python36\lib\site-packages\matplotlib\pyplot.py in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, hold, data, **kwargs)
3473 vmin=vmin, vmax=vmax, alpha=alpha,
3474 linewidths=linewidths, verts=verts,
-> 3475 edgecolors=edgecolors, data=data, **kwargs)
3476 finally:
3477 ax._hold = washold
c:\python36\lib\site-packages\matplotlib\__init__.py in inner(ax, *args, **kwargs)
1865 "the Matplotlib list!)" % (label_namer, func.__name__),
1866 RuntimeWarning, stacklevel=2)
-> 1867 return func(ax, *args, **kwargs)
1868
1869 inner.__doc__ = _add_data_doc(inner.__doc__,
c:\python36\lib\site-packages\matplotlib\axes\_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, **kwargs)
4255 y = np.ma.ravel(y)
4256 if x.size != y.size:
-> 4257 raise ValueError("x and y must be the same size")
4258
4259 if s is None:
ValueError: x and y must be the same size
X:
IMU_hand_temp hand_acceleration_16_1 hand_acceleration_16_2 hand_acceleration_16_3 hand_gyroscope_rad_7 hand_gyroscope_rad_8 hand_gyroscope_rad_9 hand_magnetometer_μT_10 hand_magnetometer_μT_11 hand_magnetometer_μT_12
0 30.375 2.21530 8.27915 5.58753 -0.004750 0.037579 -0.011145 8.93200 -67.9326 -19.9755
1 30.375 2.29196 7.67288 5.74467 -0.171710 0.025479 -0.009538 9.58300 -67.9584 -20.9091
2 30.375 2.29090 7.14240 5.82342 -0.238241 0.011214 0.000831 9.05516 -67.4017 -19.5083
3 30.375 2.21800 7.14365 5.89930 -0.192912 0.019053 0.013374 9.92698 -67.4387 -20.5602
4 30.375 2.30106 7.25857 6.09259 -0.069961 -0.018328 0.004582 9.15626 -67.1825 -20.0857
... ... ... ... ... ... ... ... ... ... ...
1942867 25.125 4.99466 6.01881 5.59830 -0.289166 -0.110170 0.238570 -4.79353 -18.1271 -48.2695
1942868 25.125 5.02764 5.90369 5.48372 -0.275411 -0.128358 0.267409 -4.54101 -18.0169 -48.9268
1942869 25.125 5.06409 5.71370 5.48491 -0.289885 -0.126548 0.281483 -4.17401 -17.9121 -48.4032
1942870 25.125 5.13914 5.63724 5.48629 -0.234417 -0.101485 0.275497 -4.66091 -18.2588 -49.0563
1942871 25.125 5.00812 5.40645 5.02326 -0.260924 -0.093849 0.266205 -5.05008 -17.6169 -48.1408
y:
0 100.0
1 100.0
2 100.0
3 100.0
4 100.0
...
1942867 162.0
1942868 162.0
1942869 162.0
1942870 162.0
1942871 162.0
Name: heart_rate, Length: 1942872, dtype: float64
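You are trying to generate a scatter plot of the DataFrame x (a 1942872 × 10 object) against the Series y (1942872 values). The code fails because x has more elements in total than y: scatter flattens both inputs and requires them to contain the same number of elements.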
print('size of x = {0}'.format(x.size))
print('size of y = {0}'.format(y.size))
assert x.size == y.size
The sizes are not equal, hence the code fails.
If you must have a scatter plot of x against y, do so on a column-by-column basis
for col in x.columns:
plt.scatter(x[col], y, s=10)
I am trying to create histograms for feature analysis to see how similar high grade tumor and low grade tumor characteristics are. Something similar to the histogram shown below:
Matrix "Z" looks like this and contains 106 features
Matrix "Z"
The code I am using is:
#Import data
data = pd.read_csv("/Users/MLuser/Desktop/Data table - CM_Transposed.csv")
# Preprocess data
bins = (0,2,4)
group_names = ['low_grade','high_grade']
data['FGrade'] = pd.cut(data['FGrade'], bins=bins, labels=group_names)
label_grade = LabelEncoder()
data['FGrade'] = label_grade.fit_transform(data['FGrade'])
Z=data
# Separate the dataset as response variable and feature variables
Z = Z.drop('Feature Name', axis = 1)
Z.columns
low_grade=Z[Z.FGrade==0] # define malignant
high_grade=Z[Z.FGrade==1] # define benign
for i, col in enumerate(Z.columns):
plt.figure(i)
sns.distplot(low_grade.iloc[:,i], color='red')
sns.distplot(high_grade.iloc[:,i], color='g')
Unfortunately when I run the above, I get a single graph and the following error:
---------------------------------------------------------------------------
LinAlgError Traceback (most recent call last)
<ipython-input-25-79d6e8cd51a8> in <module>
27 for i, col in enumerate(Z.columns):
28 plt.figure(i)
---> 29 sns.distplot(low_grade.iloc[:,i], color='red')
30 sns.distplot(high_grade.iloc[:,i], color='g')
31
~/env/lib/python3.6/site-packages/seaborn/distributions.py in distplot(a, bins, hist, kde, rug, fit, hist_kws, kde_kws, rug_kws, fit_kws, color, vertical, norm_hist, axlabel, label, ax)
229 if kde:
230 kde_color = kde_kws.pop("color", color)
--> 231 kdeplot(a, vertical=vertical, ax=ax, color=kde_color, **kde_kws)
232 if kde_color != color:
233 kde_kws["color"] = kde_color
~/env/lib/python3.6/site-packages/seaborn/distributions.py in kdeplot(data, data2, shade, vertical, kernel, bw, gridsize, cut, clip, legend, cumulative, shade_lowest, cbar, cbar_ax, cbar_kws, ax, **kwargs)
689 ax = _univariate_kdeplot(data, shade, vertical, kernel, bw,
690 gridsize, cut, clip, legend, ax,
--> 691 cumulative=cumulative, **kwargs)
692
693 return ax
~/env/lib/python3.6/site-packages/seaborn/distributions.py in _univariate_kdeplot(data, shade, vertical, kernel, bw, gridsize, cut, clip, legend, ax, cumulative, **kwargs)
292 "only implemented in statsmodels."
293 "Please install statsmodels.")
--> 294 x, y = _scipy_univariate_kde(data, bw, gridsize, cut, clip)
295
296 # Make sure the density is nonnegative
~/env/lib/python3.6/site-packages/seaborn/distributions.py in _scipy_univariate_kde(data, bw, gridsize, cut, clip)
364 """Compute a univariate kernel density estimate using scipy."""
365 try:
--> 366 kde = stats.gaussian_kde(data, bw_method=bw)
367 except TypeError:
368 kde = stats.gaussian_kde(data)
~/env/lib/python3.6/site-packages/scipy/stats/kde.py in __init__(self, dataset, bw_method, weights)
206 self._neff = 1/sum(self._weights**2)
207
--> 208 self.set_bandwidth(bw_method=bw_method)
209
210 def evaluate(self, points):
~/env/lib/python3.6/site-packages/scipy/stats/kde.py in set_bandwidth(self, bw_method)
552 raise ValueError(msg)
553
--> 554 self._compute_covariance()
555
556 def _compute_covariance(self):
~/env/lib/python3.6/site-packages/scipy/stats/kde.py in _compute_covariance(self)
564 bias=False,
565 aweights=self.weights))
--> 566 self._data_inv_cov = linalg.inv(self._data_covariance)
567
568 self.covariance = self._data_covariance * self.factor**2
~/env/lib/python3.6/site-packages/scipy/linalg/basic.py in inv(a, overwrite_a, check_finite)
972 inv_a, info = getri(lu, piv, lwork=lwork, overwrite_lu=1)
973 if info > 0:
--> 974 raise LinAlgError("singular matrix")
975 if info < 0:
976 raise ValueError('illegal value in %d-th argument of internal '
LinAlgError: singular matrix
OS: 10.14.6
Python: 3.6.8.final.0
pd: 0.25.1
np: 1.17.2
sns: 0.9.0
I need to create a three-dimensional line chart in which the x-axis is dates, the y-axis is sales, and the z-axis is ids. I have used the following code:
# Data for a three-dimensional line
zline = df_csv['loc id']
xline = df_csv['date']
yline = df_csv['total_sales']
ax.plot3D(xline, yline, zline, 'gray')
This is the error that it generates:
ValueErrorTraceback (most recent call last)
<ipython-input-10-f2edc42bd85e> in <module>()
4 xline = df_csv['date']
5 yline = df_csv['total_sales']
----> 6 ax.plot3D(xline, yline, zline, 'gray')
7
8 # Data for three-dimensional scattered points
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\mpl_toolkits\mplot3d\axes3d.pyc in plot(self, xs, ys, *args, **kwargs)
1570
1571 xs, ys, zs = art3d.juggle_axes(xs, ys, zs, zdir)
-> 1572 self.auto_scale_xyz(xs, ys, zs, had_data)
1573 return lines
1574
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\mpl_toolkits\mplot3d\axes3d.pyc in auto_scale_xyz(self, X, Y, Z, had_data)
500 # to what the minimum sized rectangular volume holds the
501 # data.
--> 502 self.xy_dataLim.update_from_data_xy(np.array([x, y]).T, not had_data)
503 if z is not None:
504 self.zz_dataLim.update_from_data_xy(np.array([z, z]).T, not had_data)
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\matplotlib\transforms.pyc in update_from_data_xy(self, xy, ignore, updatex, updatey)
925 return
926
--> 927 path = Path(xy)
928 self.update_from_path(path, ignore=ignore,
929 updatex=updatex, updatey=updatey)
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\matplotlib\path.pyc in __init__(self, vertices, codes, _interpolation_steps, closed, readonly)
130 and codes as read-only arrays.
131 """
--> 132 vertices = _to_unmasked_float_array(vertices)
133 if (vertices.ndim != 2) or (vertices.shape[1] != 2):
134 raise ValueError(
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\matplotlib\cbook\__init__.pyc in _to_unmasked_float_array(x)
2050 return np.ma.asarray(x, float).filled(np.nan)
2051 else:
-> 2052 return np.asarray(x, float)
2053
2054
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\numpy\core\numeric.pyc in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: invalid literal for float(): 1/1/2016
How do I incorporate dates? It should display the point to exact date and not round it to years.
The date strings have to be converted to datetime objects first, and then to Matplotlib date numbers. Here is the solution for anyone else:
xline = matplotlib.dates.date2num(pd.to_datetime(df_csv['date']))
I have been using the matplotlib function plt.hist to generate histogram data from an array of values mV. This has worked fine in the past, but ever since I've updated my version of anaconda it throws back a ValueError:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/users/benjatin/HomeData/Code/buildSkyMap.py in <module>()
72 #get histogram of efficiencies
73 plt.figure()
---> 74 a=plt.hist(mV,bins=50)
75 plt.close()
76
/opt/apps/lsst/feb14/Linux64/anaconda/2.1.0/lib/python2.7/site-packages/matplotlib/pyplot.pyc in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, **kwargs)
2888 histtype=histtype, align=align, orientation=orientation,
2889 rwidth=rwidth, log=log, color=color, label=label,
-> 2890 stacked=stacked, **kwargs)
2891 draw_if_interactive()
2892 finally:
/opt/apps/lsst/feb14/Linux64/anaconda/2.1.0/lib/python2.7/site-packages/matplotlib/axes/_axes.pyc in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
5634 # this will automatically overwrite bins,
5635 # so that each histogram uses the same bins
-> 5636 m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
5637 m = m.astype(float) # causes problems later if it's an int
5638 if mlast is None:
/opt/apps/lsst/feb14/Linux64/anaconda/2.1.0/lib/python2.7/site-packages/numpy/lib/function_base.pyc in histogram(a, bins, range, normed, weights, density)
598 n.imag += np.bincount(indices, weights=tmp_w.imag, minlength=bins)
599 else:
--> 600 n += np.bincount(indices, weights=tmp_w, minlength=bins).astype(ntype)
601
602 # We now compute the bin edges since these are returned
ValueError: The first argument of bincount must be non-negative
None of the values in mV are negative, as was the problem here:
In [34]: mV[mV < 0]
Out[34]: array([], dtype=float64)
The update I did was:
conda: 3.7.0-py27_0 --> 4.0.5-py27_0 (soft-link)
openssl: 1.0.1h-1 --> 1.0.2h-0 (soft-link)
python: 2.7.8-1 --> 2.7.11-0 (soft-link)
pyyaml: 3.11-py27_0 --> 3.11-py27_1 (soft-link)
requests: 2.4.1-py27_0 --> 2.9.1-py27_0 (soft-link)
sqlite: 3.8.4.1-0 --> 3.9.2-0 (soft-link)
tk: 8.5.15-0 --> 8.5.18-0 (soft-link)
yaml: 0.1.4-0 --> 0.1.6-0 (soft-link)
zlib: 1.2.7-0 --> 1.2.8-0 (soft-link)
Thanks in advance for any help solving this issue.
Filter out any nan and inf from your data before plotting the histogram, for example with `mV = mV[np.isfinite(mV)]`. See the bug report here.
I'm very new to Python so please excuse any stupidity on my part.
I'm running a histogram with matplotlib and getting errors when I use subset data, the code works perfectly if I use the full dataset, hence my confusion.
Perhaps I'm not subsetting correctly?
My code is below and related errors are below, thanks.
For awareness, this was written in Python 3.
Import required packages:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Read the data:
mlb=pd.read_csv('C:\Users\ocmh\Desktop\Python\Batting.csv')
View a sample of the data:
mlb.head()
Subset the data to return just Boston data:
mlb_bos=mlb[(mlb['teamID'] == 'BOS')]
View a sample of the subset data:
mlb_bos.head()
Plot a histogram of the original data (this works perfectly):
plt.hist(mlb.AB.dropna, color= sns.desaturate("indianred",1))
Plot a histogram of the subset data (this returns the errors shown below):
plt.hist(mlb_bos.AB.dropna, color= sns.desaturate("indianred",1))
If you don't have the seaborn package installed, you can just drop color= sns.desaturate("indianred",1) as this was purely for aesthetics.
Errors below:
KeyError Traceback (most recent call last)
<ipython-input-11-1484047d7ac6> in <module>()
----> 1 plt.hist(mlb_bos.AB, color=color)
/Users/mattoconnell/anaconda/lib/python3.4/site-packages/matplotlib/pyplot.py in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, **kwargs)
2894 histtype=histtype, align=align, orientation=orientation,
2895 rwidth=rwidth, log=log, color=color, label=label,
-> 2896 stacked=stacked, **kwargs)
2897 draw_if_interactive()
2898 finally:
/Users/mattoconnell/anaconda/lib/python3.4/site-packages/matplotlib/axes/_axes.py in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
5602 # Massage 'x' for processing.
5603 # NOTE: Be sure any changes here is also done below to 'weights'
-> 5604 if isinstance(x, np.ndarray) or not iterable(x[0]):
5605 # TODO: support masked arrays;
5606 x = np.asarray(x)
/Users/mattoconnell/anaconda/lib/python3.4/site-packages/pandas/core/series.py in __getitem__(self, key)
512 def __getitem__(self, key):
513 try:
--> 514 result = self.index.get_value(self, key)
515
516 if not np.isscalar(result):
/Users/mattoconnell/anaconda/lib/python3.4/site-packages/pandas/core/index.py in get_value(self, series, key)
1458
1459 try:
-> 1460 return self._engine.get_value(s, k)
1461 except KeyError as e1:
1462 if len(self) > 0 and self.inferred_type in ['integer','boolean']:
pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3113)()
pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:2844)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3704)()
pandas/hashtable.pyx in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:7255)()
pandas/hashtable.pyx in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:7193)()
KeyError: 0