I need to create a three-dimensional line chart in which the x-axis is dates, the y-axis is sales, and the z-axis is IDs. I have used the following code:
# Data for a three-dimensional line
zline = df_csv['loc id']
xline = df_csv['date']
yline = df_csv['total_sales']
ax.plot3D(xline, yline, zline, 'gray')
This is the error that it generates:
ValueErrorTraceback (most recent call last)
<ipython-input-10-f2edc42bd85e> in <module>()
4 xline = df_csv['date']
5 yline = df_csv['total_sales']
----> 6 ax.plot3D(xline, yline, zline, 'gray')
7
8 # Data for three-dimensional scattered points
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\mpl_toolkits\mplot3d\axes3d.pyc in plot(self, xs, ys, *args, **kwargs)
1570
1571 xs, ys, zs = art3d.juggle_axes(xs, ys, zs, zdir)
-> 1572 self.auto_scale_xyz(xs, ys, zs, had_data)
1573 return lines
1574
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\mpl_toolkits\mplot3d\axes3d.pyc in auto_scale_xyz(self, X, Y, Z, had_data)
500 # to what the minimum sized rectangular volume holds the
501 # data.
--> 502 self.xy_dataLim.update_from_data_xy(np.array([x, y]).T, not had_data)
503 if z is not None:
504 self.zz_dataLim.update_from_data_xy(np.array([z, z]).T, not had_data)
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\matplotlib\transforms.pyc in update_from_data_xy(self, xy, ignore, updatex, updatey)
925 return
926
--> 927 path = Path(xy)
928 self.update_from_path(path, ignore=ignore,
929 updatex=updatex, updatey=updatey)
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\matplotlib\path.pyc in __init__(self, vertices, codes, _interpolation_steps, closed, readonly)
130 and codes as read-only arrays.
131 """
--> 132 vertices = _to_unmasked_float_array(vertices)
133 if (vertices.ndim != 2) or (vertices.shape[1] != 2):
134 raise ValueError(
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\matplotlib\cbook\__init__.pyc in _to_unmasked_float_array(x)
2050 return np.ma.asarray(x, float).filled(np.nan)
2051 else:
-> 2052 return np.asarray(x, float)
2053
2054
C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\lib\site-packages\numpy\core\numeric.pyc in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: invalid literal for float(): 1/1/2016
How do I incorporate dates? The plot should place each point at its exact date rather than rounding to years.
The date strings have to be converted to datetimes and then to Matplotlib's numeric date format first. Here is the solution for anyone else:
xline = matplotlib.dates.date2num(pd.to_datetime(df_csv['date']))
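For the full picture, here is a minimal sketch built around that one-liner. The tiny df_csv, the figure setup, and the DateFormatter call are my additions for illustration; the column names come from the question.
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the 3d projection

# Stand-in for the real CSV, using the date format from the error message
df_csv = pd.DataFrame({'date': ['1/1/2016', '1/2/2016', '1/3/2016'],
                       'total_sales': [100, 120, 90],
                       'loc id': [1, 1, 1]})

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Convert the date strings to Matplotlib's numeric date representation
xline = mdates.date2num(pd.to_datetime(df_csv['date']))
yline = df_csv['total_sales']
zline = df_csv['loc id']
ax.plot3D(xline, yline, zline, 'gray')

# Optional: show readable dates on the x-axis instead of raw day numbers
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
plt.show()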
I'm trying to replicate the results described in How to Determine the Best Fitting Data Distribution Using Python. I then used the following code:
import numpy as np
from distfit import distfit
# Generate 10000 normal distribution samples with mean 0, std dev of 3
X = np.random.normal(0, 3, 10000)
# Initialize distfit
dist = distfit()
# Determine best-fitting probability distribution for data
dist.fit_transform(X)
Anyway, I obtained the following error:
[distfit] >fit..
[distfit] >transform..
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-8-02f73e7f157d> in <module>
9
10 # Determine best-fitting probability distribution for data
---> 11 dist.fit_transform(X)
~\Anaconda3\lib\site-packages\distfit\distfit.py in fit_transform(self, X, verbose)
275 self.fit(verbose=verbose)
276 # Transform X based on functions
--> 277 self.transform(X, verbose=verbose)
278 # Store
279 results = _store(self.alpha,
~\Anaconda3\lib\site-packages\distfit\distfit.py in transform(self, X, verbose)
214 if self.method=='parametric':
215 # Compute best distribution fit on the empirical X
--> 216 out_summary, model = _compute_score_distribution(X, X_bins, y_obs, self.distributions, self.stats, verbose=verbose)
217 # Determine confidence intervals on the best fitting distribution
218 model = _compute_cii(self, model, verbose=verbose)
~\Anaconda3\lib\site-packages\distfit\distfit.py in _compute_score_distribution(data, X, y_obs, DISTRIBUTIONS, stats, verbose)
906 model['params'] = (0.0, 1.0)
907 best_score = np.inf
--> 908 df = pd.DataFrame(index=range(0, len(DISTRIBUTIONS)), columns=['distr', 'score', 'LLE', 'loc', 'scale', 'arg'])
909 max_name_len = np.max(list(map(lambda x: len(x.name), DISTRIBUTIONS)))
910
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
346 dtype=dtype, copy=copy)
347 elif isinstance(data, dict):
--> 348 mgr = self._init_dict(data, index, columns, dtype=dtype)
349 elif isinstance(data, ma.MaskedArray):
350 import numpy.ma.mrecords as mrecords
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _init_dict(self, data, index, columns, dtype)
449 nan_dtype = dtype
450 v = construct_1d_arraylike_from_scalar(np.nan, len(index),
--> 451 nan_dtype)
452 arrays.loc[missing] = [v] * missing.sum()
453
~\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in construct_1d_arraylike_from_scalar(value, length, dtype)
1194 else:
1195 if not isinstance(dtype, (np.dtype, type(np.dtype))):
-> 1196 dtype = dtype.dtype
1197
1198 # coerce if we have nan for an integer dtype
AttributeError: type object 'object' has no attribute 'dtype'
(I'm using Jupyter.)
How can I fix this problem?
The solution to the above error, as noted in the comments on the question, was to upgrade pandas: the issue appears in pandas 1.0.4 and lower.
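A quick sanity check before and after upgrading (a sketch; the pip command in the comment is the usual upgrade route, adjust if you use conda):
import pandas as pd

# The AttributeError above was reported on pandas 1.0.4 and lower
print(pd.__version__)

# From a terminal, or a notebook cell prefixed with "!":
#   pip install --upgrade pandas
# Restart the Jupyter kernel afterwards so the new version is actually loaded.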
I'm trying to use the (relatively new) Python AutoImpute package, but I keep getting a shape mismatch error when trying to use a particular column as a predictor.
This is what my pandas dataframe looks like
I can impute using the 'sex', 'group', and 'binned_age' columns, but not using the 'experiment' column. When I try doing that, I get this error:
ValueError: shapes (9,) and (4,13) not aligned: 9 (dim 0) != 4 (dim 0)
This is my code for actually fitting and running the imputer:
cat_predictors = ['experiment', 'sex', 'group', 'binned_age']
si = SingleImputer(
    strategy={'FSIQ': 'default predictive'},
    predictors={'FSIQ': cat_predictors},
)
imputed_data = si.fit_transform(df2)
In trying to diagnose the problem, I found that if I reduce the number of unique strings in the 'experiment' column to 3 or fewer, the problem goes away for some reason. But I don't want to do that and lose some of my data. Any help?
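For reference, a quick way to inspect that column before imputing (a sketch; df2 and the 'experiment' column name are taken from the question):
# How many distinct experiment labels are there, and how are the rows spread across them?
print(df2['experiment'].nunique())
print(df2['experiment'].value_counts())
print(df2['experiment'].dtype)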
Full trace below:
ValueError Traceback (most recent call last)
<ipython-input-11-3d4388ba92e4> in <module>
1 si = SingleImputer(
2 strategy={'FSIQ': 'pmm'}, imp_kwgs={'pmm': {'tune': 10000, 'sample':10000}})
----> 3 data_imputed_once = si.fit_transform(df2)
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/imputations/dataframe/single_imputer.py in fit_transform(self, X, y)
288 X (pd.DataFrame): imputed in place or copy of original.
289 """
--> 290 return self.fit(X, y).transform(X)
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/utils/checks.py in wrapper(d, *args, **kwargs)
59 err = f"Neither {d_err} nor {a_err} are of type pd.DataFrame"
60 raise TypeError(err)
---> 61 return func(d, *args, **kwargs)
62 return wrapper
63
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/utils/checks.py in wrapper(d, *args, **kwargs)
124
125 # return func if no missingness violations detected, then return wrap
--> 126 return func(d, *args, **kwargs)
127 return wrapper
128
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/utils/checks.py in wrapper(d, *args, **kwargs)
171 err = f"All values missing in column(s) {nc}. Should be removed."
172 raise ValueError(err)
--> 173 return func(d, *args, **kwargs)
174 return wrapper
175
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/imputations/dataframe/single_imputer.py in transform(self, X, imp_ixs)
274
275 # perform imputation given the specified imputer and value for x_
--> 276 X.loc[imp_ix, column] = imputer.impute(x_)
277 return X
278
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/imputations/series/pmm.py in impute(self, X)
187 # imputed values are actual y vals corresponding to nearest neighbors
188 # therefore, this is a form of "hot-deck" imputation
--> 189 y_pred_bayes = alpha_bayes + beta_bayes.dot(X.T)
190 n_ = self.neighbors
191 if X.columns.size == 1:
ValueError: shapes (9,) and (4,13) not aligned: 9 (dim 0) != 4 (dim 0)
I am trying to create histograms for feature analysis to see how similar high-grade and low-grade tumor characteristics are, something similar to the histogram shown below:
Matrix "Z" looks like this and contains 106 features
Matrix "Z"
The code I am using is:
# Imports (needed for the snippet to run on its own)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Import data
data = pd.read_csv("/Users/MLuser/Desktop/Data table - CM_Transposed.csv")

# Preprocess data
bins = (0, 2, 4)
group_names = ['low_grade', 'high_grade']
data['FGrade'] = pd.cut(data['FGrade'], bins=bins, labels=group_names)
label_grade = LabelEncoder()
data['FGrade'] = label_grade.fit_transform(data['FGrade'])
Z = data

# Separate the dataset into response variable and feature variables
Z = Z.drop('Feature Name', axis=1)
Z.columns

low_grade = Z[Z.FGrade == 0]   # define malignant
high_grade = Z[Z.FGrade == 1]  # define benign

for i, col in enumerate(Z.columns):
    plt.figure(i)
    sns.distplot(low_grade.iloc[:, i], color='red')
    sns.distplot(high_grade.iloc[:, i], color='g')
Unfortunately when I run the above, I get a single graph and the following error:
---------------------------------------------------------------------------
LinAlgError Traceback (most recent call last)
<ipython-input-25-79d6e8cd51a8> in <module>
27 for i, col in enumerate(Z.columns):
28 plt.figure(i)
---> 29 sns.distplot(low_grade.iloc[:,i], color='red')
30 sns.distplot(high_grade.iloc[:,i], color='g')
31
~/env/lib/python3.6/site-packages/seaborn/distributions.py in distplot(a, bins, hist, kde, rug, fit, hist_kws, kde_kws, rug_kws, fit_kws, color, vertical, norm_hist, axlabel, label, ax)
229 if kde:
230 kde_color = kde_kws.pop("color", color)
--> 231 kdeplot(a, vertical=vertical, ax=ax, color=kde_color, **kde_kws)
232 if kde_color != color:
233 kde_kws["color"] = kde_color
~/env/lib/python3.6/site-packages/seaborn/distributions.py in kdeplot(data, data2, shade, vertical, kernel, bw, gridsize, cut, clip, legend, cumulative, shade_lowest, cbar, cbar_ax, cbar_kws, ax, **kwargs)
689 ax = _univariate_kdeplot(data, shade, vertical, kernel, bw,
690 gridsize, cut, clip, legend, ax,
--> 691 cumulative=cumulative, **kwargs)
692
693 return ax
~/env/lib/python3.6/site-packages/seaborn/distributions.py in _univariate_kdeplot(data, shade, vertical, kernel, bw, gridsize, cut, clip, legend, ax, cumulative, **kwargs)
292 "only implemented in statsmodels."
293 "Please install statsmodels.")
--> 294 x, y = _scipy_univariate_kde(data, bw, gridsize, cut, clip)
295
296 # Make sure the density is nonnegative
~/env/lib/python3.6/site-packages/seaborn/distributions.py in _scipy_univariate_kde(data, bw, gridsize, cut, clip)
364 """Compute a univariate kernel density estimate using scipy."""
365 try:
--> 366 kde = stats.gaussian_kde(data, bw_method=bw)
367 except TypeError:
368 kde = stats.gaussian_kde(data)
~/env/lib/python3.6/site-packages/scipy/stats/kde.py in __init__(self, dataset, bw_method, weights)
206 self._neff = 1/sum(self._weights**2)
207
--> 208 self.set_bandwidth(bw_method=bw_method)
209
210 def evaluate(self, points):
~/env/lib/python3.6/site-packages/scipy/stats/kde.py in set_bandwidth(self, bw_method)
552 raise ValueError(msg)
553
--> 554 self._compute_covariance()
555
556 def _compute_covariance(self):
~/env/lib/python3.6/site-packages/scipy/stats/kde.py in _compute_covariance(self)
564 bias=False,
565 aweights=self.weights))
--> 566 self._data_inv_cov = linalg.inv(self._data_covariance)
567
568 self.covariance = self._data_covariance * self.factor**2
~/env/lib/python3.6/site-packages/scipy/linalg/basic.py in inv(a, overwrite_a, check_finite)
972 inv_a, info = getri(lu, piv, lwork=lwork, overwrite_lu=1)
973 if info > 0:
--> 974 raise LinAlgError("singular matrix")
975 if info < 0:
976 raise ValueError('illegal value in %d-th argument of internal '
LinAlgError: singular matrix
OS: 10.14.6
Python: 3.6.8.final.0
pd: 0.25.1
np: 1.17.2
sns: 0.9.0
I previously asked How do I style only the last row of a pandas dataframe? and got a perfect answer to the toy problem that I gave.
Turns out I should have made the toy problem a bit closer to my real problem. Consider a dataframe with more than 1 column of text data (which I can apply styling to):
import pandas as pd
import numpy as np
import seaborn as sns
cm = sns.diverging_palette(-5, 5, as_cmap=True)
df = pd.DataFrame(np.random.randn(3, 4))
df['text_column'] = 'a'
df['second_text_column'] = 'b'
df.style.background_gradient(cmap=cm)
However, like the previous question, I wish to only apply this styling to the last row. The answer to the previous question was:
df.style.background_gradient(cmap=cm, subset=df.index[-1])
which in this case gives the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/miniconda/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
/usr/local/miniconda/lib/python3.7/site-packages/pandas/io/formats/style.py in _repr_html_(self)
161 Hooks into Jupyter notebook rich display system.
162 """
--> 163 return self.render()
164
165 #Appender(_shared_docs['to_excel'] % dict(
/usr/local/miniconda/lib/python3.7/site-packages/pandas/io/formats/style.py in render(self, **kwargs)
457 * table_attributes
458 """
--> 459 self._compute()
460 # TODO: namespace all the pandas keys
461 d = self._translate()
/usr/local/miniconda/lib/python3.7/site-packages/pandas/io/formats/style.py in _compute(self)
527 r = self
528 for func, args, kwargs in self._todo:
--> 529 r = func(self)(*args, **kwargs)
530 return r
531
/usr/local/miniconda/lib/python3.7/site-packages/pandas/io/formats/style.py in _apply(self, func, axis, subset, **kwargs)
536 if axis is not None:
537 result = data.apply(func, axis=axis,
--> 538 result_type='expand', **kwargs)
539 result.columns = data.columns
540 else:
/usr/local/miniconda/lib/python3.7/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6485 args=args,
6486 kwds=kwds)
-> 6487 return op.get_result()
6488
6489 def applymap(self, func):
/usr/local/miniconda/lib/python3.7/site-packages/pandas/core/apply.py in get_result(self)
149 return self.apply_raw()
150
--> 151 return self.apply_standard()
152
153 def apply_empty_result(self):
/usr/local/miniconda/lib/python3.7/site-packages/pandas/core/apply.py in apply_standard(self)
255
256 # compute the result using the series generator
--> 257 self.apply_series_generator()
258
259 # wrap results
/usr/local/miniconda/lib/python3.7/site-packages/pandas/core/apply.py in apply_series_generator(self)
284 try:
285 for i, v in enumerate(series_gen):
--> 286 results[i] = self.f(v)
287 keys.append(v.name)
288 except Exception as e:
/usr/local/miniconda/lib/python3.7/site-packages/pandas/core/apply.py in f(x)
76
77 def f(x):
---> 78 return func(x, *args, **kwds)
79 else:
80 f = func
/usr/local/miniconda/lib/python3.7/site-packages/pandas/io/formats/style.py in _background_gradient(s, cmap, low, high, text_color_threshold)
941 smin = s.values.min()
942 smax = s.values.max()
--> 943 rng = smax - smin
944 # extend lower / upper bounds, compresses color range
945 norm = colors.Normalize(smin - (rng * low), smax + (rng * high))
TypeError: ("unsupported operand type(s) for -: 'str' and 'str'", 'occurred at index text_column')
<pandas.io.formats.style.Styler at 0x7f948dde7278>
which seems to come from the fact that it's trying to perform a numeric operation on the strings in text_column. Fair enough. How do I tell it to apply only to the last row, and only to the non-text columns? I'm OK with giving it explicit column names to use or avoid, but I don't know how to pass that into this inscrutable subset argument.
I am running:
python version 3.7.3
pandas version 0.24.2
Using a tuple for subset worked for me, though I'm not sure it is the most elegant solution:
df.style.background_gradient(cmap=cm,
                             subset=(df.index[-1], df.select_dtypes(float).columns))
Output:
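If you would rather pass explicit column names, as the question suggests, here is a minimal sketch along the same lines. It reuses df and cm from above; text_cols and numeric_cols are just illustrative names.
# Name the text columns to skip, and keep everything else
text_cols = ['text_column', 'second_text_column']
numeric_cols = [c for c in df.columns if c not in text_cols]

# Apply the gradient to the last row only, restricted to the non-text columns
df.style.background_gradient(cmap=cm, subset=(df.index[-1], numeric_cols))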
You want to apply a style to a pandas dataframe and set different colors on different columns or rows.
Here you can find code ready to run on your own df. :)
Apply to rows using axis=0 and a subset on df.index, or, as in this example, to columns using axis=1 and a subset on df.columns.
cmaps = [
    'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
    'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
    'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn'
]

df.style.\
    background_gradient(
        cmap=cmaps[1], axis=0,
        subset=(
            df.index[:],
            df.columns[df.columns.get_loc('nb tickets'):df.columns.get_loc('nb ref_prod') + 1]
        )
    ).\
    background_gradient(
        cmap=cmaps[3],
        subset=(
            df.index[:],
            df.columns[df.columns.get_loc('am'):df.columns.get_loc('pm') + 1]
        )
    ).\
    background_gradient(
        cmap=cmaps[4],
        subset=(
            df.index[:],
            df.columns[df.columns.get_loc('Week_1'):df.columns.get_loc('Week_5') + 1]
        )
    ).\
    background_gradient(
        cmap=cmaps[5],
        subset=(
            df.index[:],
            df.columns[df.columns.get_loc('sum qty'):df.columns.get_loc('sum euro') + 1]
        )
    )
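To make the pattern easier to try, here is a self-contained sketch of the same idea with a small made-up dataframe; the column names and colormaps are only for illustration.
import numpy as np
import pandas as pd

# Made-up data so the snippet runs on its own
df = pd.DataFrame(np.random.rand(5, 4),
                  columns=['nb tickets', 'nb ref_prod', 'am', 'pm'])

styled = (
    df.style
    .background_gradient(
        cmap='Blues', axis=0,
        subset=(df.index[:],
                df.columns[df.columns.get_loc('nb tickets'):df.columns.get_loc('nb ref_prod') + 1]))
    .background_gradient(
        cmap='Greens', axis=0,
        subset=(df.index[:],
                df.columns[df.columns.get_loc('am'):df.columns.get_loc('pm') + 1]))
)
styled  # display in a Jupyter cell to see the colors
Note that the get_loc slicing selects a contiguous block of columns, so it assumes the named columns sit next to each other in the dataframe.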
I have tried a lot of methods and gone through many questions that are already answered here, but in vain: none of them solved my problem. So please, if you can't solve this problem, at least don't mark it as a duplicate, because I am desperately trying to get my model to work and am stuck on this error.
My query is pretty simple: I have a data frame that consists of 2 columns, the first is 'sqft_living' and the second is 'price'. I have used linear regression to predict price based on its square-foot area, and I want to visualize this as a scatter plot followed by a best-fit line. However, I am getting this error:
TypeError: unhashable type: 'numpy.ndarray'
I have already converted the dataframe columns into series and corrected the dimensions as well, but I am still getting this error.
Please provide me with solution code along with an explanation.
Any help will be highly appreciated as I am stuck with this and can't complete my assignment.
Below is the exact code and the error I am getting.
import numpy as np
import matplotlib.pyplot as plt # To visualize
import pandas as pd # To read data
from sklearn.linear_model import LinearRegression
X = poly1_data.iloc[:, 0].values.reshape(-1, 1) # values converts it into a numpy array
Y = poly1_data.iloc[:, 1].values.reshape(-1, 1) # -1 means that calculate the dimension of rows, but have 1 column
linear_regressor = LinearRegression() # create object for the class
linear_regressor.fit(X, Y) # perform linear regression
Y_pred = linear_regressor.predict(X) # make predictions
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='red')
plt.show()
--------------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-245-96227c9411b1> in <module>
9 linear_regressor.fit(X, Y) # perform linear regression
10 Y_pred = linear_regressor.predict(X) # make predictions
---> 11 plt.scatter(X, Y)
12 plt.plot(X, Y_pred, color='red')
13 plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, data, **kwargs)
2860 vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths,
2861 verts=verts, edgecolors=edgecolors, **({"data": data} if data
-> 2862 is not None else {}), **kwargs)
2863 sci(__ret)
2864 return __ret
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, data, *args, **kwargs)
1808 "the Matplotlib list!)" % (label_namer, func.__name__),
1809 RuntimeWarning, stacklevel=2)
-> 1810 return func(ax, *args, **kwargs)
1811
1812 inner.__doc__ = _add_data_doc(inner.__doc__,
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, **kwargs)
4170 edgecolors = 'face'
4171
-> 4172 self._process_unit_info(xdata=x, ydata=y, kwargs=kwargs)
4173 x = self.convert_xunits(x)
4174 y = self.convert_yunits(y)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _process_unit_info(self, xdata, ydata, kwargs)
2133 return kwargs
2134
-> 2135 kwargs = _process_single_axis(xdata, self.xaxis, 'xunits', kwargs)
2136 kwargs = _process_single_axis(ydata, self.yaxis, 'yunits', kwargs)
2137 return kwargs
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _process_single_axis(data, axis, unit_name, kwargs)
2116 # We only need to update if there is nothing set yet.
2117 if not axis.have_units():
-> 2118 axis.update_units(data)
2119
2120 # Check for units in the kwargs, and if present update axis
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axis.py in update_units(self, data)
1471 neednew = self.converter != converter
1472 self.converter = converter
-> 1473 default = self.converter.default_units(data, self)
1474 if default is not None and self.units is None:
1475 self.set_units(default)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in default_units(data, axis)
101 # default_units->axis_info->convert
102 if axis.units is None:
--> 103 axis.set_units(UnitData(data))
104 else:
105 axis.units.update(data)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in __init__(self, data)
167 self._counter = itertools.count()
168 if data is not None:
--> 169 self.update(data)
170
171 def update(self, data):
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in update(self, data)
184 data = np.atleast_1d(np.array(data, dtype=object))
185
--> 186 for val in OrderedDict.fromkeys(data):
187 if not isinstance(val, (str, bytes)):
188 raise TypeError("{val!r} is not a string".format(val=val))
TypeError: unhashable type: 'numpy.ndarray'
[I am getting this image without any scatter plot and best fit line][1]
[1]: https://i.stack.imgur.com/9jccu.png