I am trying to create histograms for feature analysis to see how similar high grade tumor and low grade tumor characteristics are. Something similar to the histogram shown below:
Matrix "Z" looks like this and contains 106 features
Matrix "Z"
The code I am using is:
# Import data
data = pd.read_csv("/Users/MLuser/Desktop/Data table - CM_Transposed.csv")

# Preprocess data: bin the numeric grade into two named groups, then
# integer-encode the group names.
bins = (0, 2, 4)
group_names = ['low_grade', 'high_grade']
data['FGrade'] = pd.cut(data['FGrade'], bins=bins, labels=group_names)
label_grade = LabelEncoder()
data['FGrade'] = label_grade.fit_transform(data['FGrade'])
Z = data

# Drop the 'Feature Name' column; FGrade and the feature columns remain
Z = Z.drop('Feature Name', axis=1)
Z.columns

# LabelEncoder numbers its classes in sorted order, so 'high_grade' is
# encoded as 0 and 'low_grade' as 1. Look the codes up from the fitted
# encoder instead of hard-coding them, so each subset holds the group its
# name says it does. A group missing from the data maps to -1 (no rows).
grade_codes = {name: code for code, name in enumerate(label_grade.classes_)}
low_grade = Z[Z.FGrade == grade_codes.get('low_grade', -1)]
high_grade = Z[Z.FGrade == grade_codes.get('high_grade', -1)]

# One figure per column, overlaying the two groups' distributions
for i, col in enumerate(Z.columns):
    plt.figure(i)
    sns.distplot(low_grade.iloc[:,i], color='red')
    sns.distplot(high_grade.iloc[:,i], color='g')
Unfortunately when I run the above, I get a single graph and the following error:
---------------------------------------------------------------------------
LinAlgError Traceback (most recent call last)
<ipython-input-25-79d6e8cd51a8> in <module>
27 for i, col in enumerate(Z.columns):
28 plt.figure(i)
---> 29 sns.distplot(low_grade.iloc[:,i], color='red')
30 sns.distplot(high_grade.iloc[:,i], color='g')
31
~/env/lib/python3.6/site-packages/seaborn/distributions.py in distplot(a, bins, hist, kde, rug, fit, hist_kws, kde_kws, rug_kws, fit_kws, color, vertical, norm_hist, axlabel, label, ax)
229 if kde:
230 kde_color = kde_kws.pop("color", color)
--> 231 kdeplot(a, vertical=vertical, ax=ax, color=kde_color, **kde_kws)
232 if kde_color != color:
233 kde_kws["color"] = kde_color
~/env/lib/python3.6/site-packages/seaborn/distributions.py in kdeplot(data, data2, shade, vertical, kernel, bw, gridsize, cut, clip, legend, cumulative, shade_lowest, cbar, cbar_ax, cbar_kws, ax, **kwargs)
689 ax = _univariate_kdeplot(data, shade, vertical, kernel, bw,
690 gridsize, cut, clip, legend, ax,
--> 691 cumulative=cumulative, **kwargs)
692
693 return ax
~/env/lib/python3.6/site-packages/seaborn/distributions.py in _univariate_kdeplot(data, shade, vertical, kernel, bw, gridsize, cut, clip, legend, ax, cumulative, **kwargs)
292 "only implemented in statsmodels."
293 "Please install statsmodels.")
--> 294 x, y = _scipy_univariate_kde(data, bw, gridsize, cut, clip)
295
296 # Make sure the density is nonnegative
~/env/lib/python3.6/site-packages/seaborn/distributions.py in _scipy_univariate_kde(data, bw, gridsize, cut, clip)
364 """Compute a univariate kernel density estimate using scipy."""
365 try:
--> 366 kde = stats.gaussian_kde(data, bw_method=bw)
367 except TypeError:
368 kde = stats.gaussian_kde(data)
~/env/lib/python3.6/site-packages/scipy/stats/kde.py in __init__(self, dataset, bw_method, weights)
206 self._neff = 1/sum(self._weights**2)
207
--> 208 self.set_bandwidth(bw_method=bw_method)
209
210 def evaluate(self, points):
~/env/lib/python3.6/site-packages/scipy/stats/kde.py in set_bandwidth(self, bw_method)
552 raise ValueError(msg)
553
--> 554 self._compute_covariance()
555
556 def _compute_covariance(self):
~/env/lib/python3.6/site-packages/scipy/stats/kde.py in _compute_covariance(self)
564 bias=False,
565 aweights=self.weights))
--> 566 self._data_inv_cov = linalg.inv(self._data_covariance)
567
568 self.covariance = self._data_covariance * self.factor**2
~/env/lib/python3.6/site-packages/scipy/linalg/basic.py in inv(a, overwrite_a, check_finite)
972 inv_a, info = getri(lu, piv, lwork=lwork, overwrite_lu=1)
973 if info > 0:
--> 974 raise LinAlgError("singular matrix")
975 if info < 0:
976 raise ValueError('illegal value in %d-th argument of internal '
LinAlgError: singular matrix
OS: 10.14.6
Python: 3.6.8.final.0
pd: 0.25.1
np: 1.17.2
sns: 0.9.0
Related
This error keeps occurring when I try to compute mutual information (MI) scores. My code is as follows:
X_new = X.copy()
X_new = X_new.fillna(0)
y = data.SalePrice
def make_mi_scores(X, y):
    """Return mutual-information scores of each column of X against y.

    Works on a copy of X, so the caller's frame is not modified.

    Args:
        X: Feature DataFrame. Columns of dtype ``object`` or ``category``
            are replaced by their integer factorization codes.
        y: Target values passed to ``mutual_info_regression``.

    Returns:
        A pandas Series named "MI Scores", indexed by the column names of
        X and sorted in descending order.
    """
    X = X.copy()
    # Integer-encode every object/category column
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # A column is flagged as discrete exactly when its dtype is an integer
    # dtype. NOTE(review): this also flags integer-valued numeric columns
    # that are not categorical -- confirm that is intended.
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the given scores on the current figure.

    Args:
        scores: pandas Series of scores; its index supplies the bar labels.
            The values are sorted in ascending order before plotting.
    """
    scores = scores.sort_values(ascending=True)
    # One bar position per score, labelled with the matching index entry
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(make_mi_scores(X_new,y))
If you want the full notebook, here is a link: https://www.kaggle.com/code/snigdhkarki/house-price-competition
The error is as follows
ValueError Traceback (most recent call last)
/tmp/ipykernel_19/1575243112.py in <module>
42
43 plt.figure(dpi=100, figsize=(8, 5))
---> 44 plot_mi_scores(make_mi_scores(X_new,y))
/tmp/ipykernel_19/1575243112.py in make_mi_scores(X, y)
28 print(X.isnull().any().any())
29 print(y.isnull().any().any())
---> 30 mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
31 mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
32 mi_scores = mi_scores.sort_values(ascending=False)
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in mutual_info_regression(X, y, discrete_features, n_neighbors, copy, random_state)
382 of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
383 """
--> 384 return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)
385
386
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _estimate_mi(X, y, discrete_features, discrete_target, n_neighbors, copy, random_state)
300 mi = [
301 _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
--> 302 for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
303 ]
304
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in <listcomp>(.0)
300 mi = [
301 _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
--> 302 for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
303 ]
304
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _compute_mi(x, y, x_discrete, y_discrete, n_neighbors)
160 return mutual_info_score(x, y)
161 elif x_discrete and not y_discrete:
--> 162 return _compute_mi_cd(y, x, n_neighbors)
163 elif not x_discrete and y_discrete:
164 return _compute_mi_cd(x, y, n_neighbors)
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _compute_mi_cd(c, d, n_neighbors)
137 radius = radius[mask]
138
--> 139 kd = KDTree(c)
140 m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)
141 m_all = np.array(m_all) - 1.0
sklearn/neighbors/_binary_tree.pxi in sklearn.neighbors._kd_tree.BinaryTree.__init__()
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
806 "Found array with %d sample(s) (shape=%s) while a"
807 " minimum of %d is required%s."
--> 808 % (n_samples, array.shape, ensure_min_samples, context)
809 )
810
ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.
There were only a few places where this question was asked, and even in those places I was unable to find any answers to my question.
The issue is arising where you are calling mutual_info_regression over here -
mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
As per sklearn's documentation, the parameter discrete_features should be a boolean mask that has True for discrete variables and False otherwise.
I checked your Kaggle code and it seems like your technique for identifying discrete and continuous features in your data frame is wrong.
A simple hack to get the code running would be to identify all features as continuous using the following code -
discrete_features = [False]*73
# 73 is the number of columns X has
However, the result might be wrong if the mutual_info_regression algorithm requires you to accurately identify discrete and continuous features.
I need to create a three-dimensional line chart in which the x-axis is dates, the y-axis is sales, and the z-axis is ids. I have used the following code:
# Data for a three-dimensional line
zline = df_csv['loc id']
xline = df_csv['date']
yline = df_csv['total_sales']
ax.plot3D(xline, yline, zline, 'gray')
This is the error that it generates:
ValueErrorTraceback (most recent call last)
<ipython-input-10-f2edc42bd85e> in <module>()
4 xline = df_csv['date']
5 yline = df_csv['total_sales']
----> 6 ax.plot3D(xline, yline, zline, 'gray')
7
8 # Data for three-dimensional scattered points
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\mpl_toolkits\mplot3d\axes3d.pyc in plot(self, xs, ys, *args, **kwargs)
1570
1571 xs, ys, zs = art3d.juggle_axes(xs, ys, zs, zdir)
-> 1572 self.auto_scale_xyz(xs, ys, zs, had_data)
1573 return lines
1574
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\mpl_toolkits\mplot3d\axes3d.pyc in auto_scale_xyz(self, X, Y, Z, had_data)
500 # to what the minimum sized rectangular volume holds the
501 # data.
--> 502 self.xy_dataLim.update_from_data_xy(np.array([x, y]).T, not had_data)
503 if z is not None:
504 self.zz_dataLim.update_from_data_xy(np.array([z, z]).T, not had_data)
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\matplotlib\transforms.pyc in update_from_data_xy(self, xy, ignore, updatex, updatey)
925 return
926
--> 927 path = Path(xy)
928 self.update_from_path(path, ignore=ignore,
929 updatex=updatex, updatey=updatey)
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\matplotlib\path.pyc in __init__(self, vertices, codes, _interpolation_steps, closed, readonly)
130 and codes as read-only arrays.
131 """
--> 132 vertices = _to_unmasked_float_array(vertices)
133 if (vertices.ndim != 2) or (vertices.shape[1] != 2):
134 raise ValueError(
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\matplotlib\cbook\__init__.pyc in _to_unmasked_float_array(x)
2050 return np.ma.asarray(x, float).filled(np.nan)
2051 else:
-> 2052 return np.asarray(x, float)
2053
2054
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\numpy\core\numeric.pyc in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: invalid literal for float(): 1/1/2016
How do I incorporate dates? It should display the point to exact date and not round it to years.
The date strings have to be converted to datetime objects first, and then to Matplotlib date numbers. Here is the solution for anyone else:
xline = matplotlib.dates.date2num(pd.to_datetime(df_csv['date']))
I have tried a lot of methods and went through many questions that are already answered here, but in vain. None of them could solve my problem. So please, if you can't solve this problem, at least don't mark it as 'duplicate', because I am desperately trying to make my model work and I am stuck at this stupid syntactical error.
So, my query is pretty simple: I have a data frame that consists of 2 columns; the 1st is 'sqft_living' and the 2nd is 'price'. I have used linear regression to predict price based on its sqft area. I want to visualize this as a scatter plot followed by a best-fit line. However, I am getting this error:
TypeError: unhashable type: 'numpy.ndarray'
I have already converted the dataframe into a series and corrected the dimensions as well, but I am still getting this error.
Please provide me with solution code along with an explanation.
Any help will be highly appreciated as I am stuck with this and can't complete my assignment.
Below is the exact code and the error I am getting.
import numpy as np
import matplotlib.pyplot as plt # To visualize
import pandas as pd # To read data
from sklearn.linear_model import LinearRegression
X = poly1_data.iloc[:, 0].values.reshape(-1, 1) # values converts it into a numpy array
Y = poly1_data.iloc[:, 1].values.reshape(-1, 1) # -1 means that calculate the dimension of rows, but have 1 column
linear_regressor = LinearRegression() # create object for the class
linear_regressor.fit(X, Y) # perform linear regression
Y_pred = linear_regressor.predict(X) # make predictions
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='red')
plt.show()
--------------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-245-96227c9411b1> in <module>
9 linear_regressor.fit(X, Y) # perform linear regression
10 Y_pred = linear_regressor.predict(X) # make predictions
---> 11 plt.scatter(X, Y)
12 plt.plot(X, Y_pred, color='red')
13 plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, data, **kwargs)
2860 vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths,
2861 verts=verts, edgecolors=edgecolors, **({"data": data} if data
-> 2862 is not None else {}), **kwargs)
2863 sci(__ret)
2864 return __ret
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, data, *args, **kwargs)
1808 "the Matplotlib list!)" % (label_namer, func.__name__),
1809 RuntimeWarning, stacklevel=2)
-> 1810 return func(ax, *args, **kwargs)
1811
1812 inner.__doc__ = _add_data_doc(inner.__doc__,
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, **kwargs)
4170 edgecolors = 'face'
4171
-> 4172 self._process_unit_info(xdata=x, ydata=y, kwargs=kwargs)
4173 x = self.convert_xunits(x)
4174 y = self.convert_yunits(y)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _process_unit_info(self, xdata, ydata, kwargs)
2133 return kwargs
2134
-> 2135 kwargs = _process_single_axis(xdata, self.xaxis, 'xunits', kwargs)
2136 kwargs = _process_single_axis(ydata, self.yaxis, 'yunits', kwargs)
2137 return kwargs
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _process_single_axis(data, axis, unit_name, kwargs)
2116 # We only need to update if there is nothing set yet.
2117 if not axis.have_units():
-> 2118 axis.update_units(data)
2119
2120 # Check for units in the kwargs, and if present update axis
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axis.py in update_units(self, data)
1471 neednew = self.converter != converter
1472 self.converter = converter
-> 1473 default = self.converter.default_units(data, self)
1474 if default is not None and self.units is None:
1475 self.set_units(default)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in default_units(data, axis)
101 # default_units->axis_info->convert
102 if axis.units is None:
--> 103 axis.set_units(UnitData(data))
104 else:
105 axis.units.update(data)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in __init__(self, data)
167 self._counter = itertools.count()
168 if data is not None:
--> 169 self.update(data)
170
171 def update(self, data):
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in update(self, data)
184 data = np.atleast_1d(np.array(data, dtype=object))
185
--> 186 for val in OrderedDict.fromkeys(data):
187 if not isinstance(val, (str, bytes)):
188 raise TypeError("{val!r} is not a string".format(val=val))
TypeError: unhashable type: 'numpy.ndarray'
[I am getting this image without any scatter plot and best fit line][1]
[1]: https://i.stack.imgur.com/9jccu.png
I'm trying to use pymc3 with a likelihood function derived from some observed data. This observed data doesn't fit any nice, standard distribution, so I want to define my own, based on these observations.
One approach is to use kernel density estimation over the observations. This was possible in pymc2, but doesn't play nicely with the Theano variables in pymc3.
In my code below I'm just generating some dummy data that is normally distributed. As my prior, I'm essentially assuming a uniform distribution for my observations.
Here's my code:
from scipy import stats
import numpy as np
import pymc3 as pm
from sklearn.neighbors.kde import KernelDensity
data = np.sort(stats.norm.rvs(loc=0, scale=1, size=1000))
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(data.reshape(-1, 1))
def get_log_likelihood(x):
    """Score x under the module-level fitted ``kde``.

    Args:
        x: Samples forwarded unchanged to ``kde.score_samples``.

    Returns:
        Whatever ``kde.score_samples(x)`` returns.
    """
    # NOTE(review): score_samples validates its input as a numeric array,
    # so a symbolic (Theano) variable passed as x is rejected here.
    return kde.score_samples(x)
# Build the model: a uniform prior on x, with the KDE-based function as a
# custom likelihood, then sample with Metropolis.
with pm.Model() as test_model:
    x = pm.Uniform('prior rv', lower=-10, upper=10)
    # DensityDist calls get_log_likelihood with the entries of `observed`
    # as keyword arguments, i.e. with the model variable x itself.
    obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})
    step = pm.Metropolis()
    trace = pm.sample(200, step=step)
The error I receive seems to be the kde score_samples function blowing up as it expects an array, but x is a Theano variable.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-49-4efbbe7376dc> in <module>()
1 with pm.Model() as test_model:
2 x = pm.Uniform('prior rv', lower=0.0, upper=1e6)
----> 3 obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})
4
5 step = pm.Metropolis()
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
40 total_size = kwargs.pop('total_size', None)
41 dist = cls.dist(*args, **kwargs)
---> 42 return model.Var(name, dist, data, total_size)
43 else:
44 raise TypeError("Name needs to be a string but got: {}".format(name))
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size)
825 with self:
826 var = MultiObservedRV(name=name, data=data, distribution=dist,
--> 827 total_size=total_size, model=self)
828 self.observed_RVs.append(var)
829 if var.missing_values:
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
1372 self.missing_values = [datum.missing_values for datum in self.data.values()
1373 if datum.missing_values is not None]
-> 1374 self.logp_elemwiset = distribution.logp(**self.data)
1375 # The logp might need scaling in minibatches.
1376 # This is done in `Factor`.
<ipython-input-48-535f58ce543b> in get_log_likelihood(x)
1 def get_log_likelihood(x):
----> 2 return kde.score_samples(x)
~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/neighbors/kde.py in score_samples(self, X)
150 # For it to be a probability, we must scale it. For this reason
151 # we'll also scale atol.
--> 152 X = check_array(X, order='C', dtype=DTYPE)
153 N = self.tree_.data.shape[0]
154 atol_N = self.atol * N
~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
431 force_all_finite)
432 else:
--> 433 array = np.array(array, dtype=dtype, order=order, copy=copy)
434
435 if ensure_2d:
ValueError: setting an array element with a sequence.
Any help would be greatly appreciated. Thanks!
I have been using the matplotlib function plt.hist to generate histogram data from an array of values mV. This has worked fine in the past, but ever since I've updated my version of anaconda it throws back a ValueError:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/users/benjatin/HomeData/Code/buildSkyMap.py in <module>()
72 #get histogram of efficiencies
73 plt.figure()
---> 74 a=plt.hist(mV,bins=50)
75 plt.close()
76
/opt/apps/lsst/feb14/Linux64/anaconda/2.1.0/lib/python2.7/site-packages/matplotlib/pyplot.pyc in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, **kwargs)
2888 histtype=histtype, align=align, orientation=orientation,
2889 rwidth=rwidth, log=log, color=color, label=label,
-> 2890 stacked=stacked, **kwargs)
2891 draw_if_interactive()
2892 finally:
/opt/apps/lsst/feb14/Linux64/anaconda/2.1.0/lib/python2.7/site-packages/matplotlib/axes/_axes.pyc in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
5634 # this will automatically overwrite bins,
5635 # so that each histogram uses the same bins
-> 5636 m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
5637 m = m.astype(float) # causes problems later if it's an int
5638 if mlast is None:
/opt/apps/lsst/feb14/Linux64/anaconda/2.1.0/lib/python2.7/site-packages/numpy/lib/function_base.pyc in histogram(a, bins, range, normed, weights, density)
598 n.imag += np.bincount(indices, weights=tmp_w.imag, minlength=bins)
599 else:
--> 600 n += np.bincount(indices, weights=tmp_w, minlength=bins).astype(ntype)
601
602 # We now compute the bin edges since these are returned
ValueError: The first argument of bincount must be non-negative
None of the values in mV are negative, as was the problem here:
In [34]: mV[mV < 0]
Out[34]: array([], dtype=float64)
The update I did was:
conda: 3.7.0-py27_0 --> 4.0.5-py27_0 (soft-link)
openssl: 1.0.1h-1 --> 1.0.2h-0 (soft-link)
python: 2.7.8-1 --> 2.7.11-0 (soft-link)
pyyaml: 3.11-py27_0 --> 3.11-py27_1 (soft-link)
requests: 2.4.1-py27_0 --> 2.9.1-py27_0 (soft-link)
sqlite: 3.8.4.1-0 --> 3.9.2-0 (soft-link)
tk: 8.5.15-0 --> 8.5.18-0 (soft-link)
yaml: 0.1.4-0 --> 0.1.6-0 (soft-link)
zlib: 1.2.7-0 --> 1.2.8-0 (soft-link)
Thanks in advance for any help solving this issue.
Filter out any nan and inf from your data before plotting the histogram. See the bug report here.