How do I style a subset of a pandas dataframe?

I previously asked How do I style only the last row of a pandas dataframe? and got a perfect answer to the toy problem that I gave.
Turns out I should have made the toy problem a bit closer to my real problem. Consider a dataframe with more than 1 column of text data (which I can apply styling to):
import pandas as pd
import numpy as np
import seaborn as sns
cm = sns.diverging_palette(-5, 5, as_cmap=True)
df = pd.DataFrame(np.random.randn(3, 4))
df['text_column'] = 'a'
df['second_text_column'] = 'b'
df.style.background_gradient(cmap=cm)
However, as in the previous question, I wish to apply this styling only to the last row. The answer to the previous question was:
df.style.background_gradient(cmap=cm, subset=df.index[-1])
which in this case gives the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/miniconda/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
/usr/local/miniconda/lib/python3.7/site-packages/pandas/io/formats/style.py in _repr_html_(self)
161 Hooks into Jupyter notebook rich display system.
162 """
--> 163 return self.render()
164
165 @Appender(_shared_docs['to_excel'] % dict(
/usr/local/miniconda/lib/python3.7/site-packages/pandas/io/formats/style.py in render(self, **kwargs)
457 * table_attributes
458 """
--> 459 self._compute()
460 # TODO: namespace all the pandas keys
461 d = self._translate()
/usr/local/miniconda/lib/python3.7/site-packages/pandas/io/formats/style.py in _compute(self)
527 r = self
528 for func, args, kwargs in self._todo:
--> 529 r = func(self)(*args, **kwargs)
530 return r
531
/usr/local/miniconda/lib/python3.7/site-packages/pandas/io/formats/style.py in _apply(self, func, axis, subset, **kwargs)
536 if axis is not None:
537 result = data.apply(func, axis=axis,
--> 538 result_type='expand', **kwargs)
539 result.columns = data.columns
540 else:
/usr/local/miniconda/lib/python3.7/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6485 args=args,
6486 kwds=kwds)
-> 6487 return op.get_result()
6488
6489 def applymap(self, func):
/usr/local/miniconda/lib/python3.7/site-packages/pandas/core/apply.py in get_result(self)
149 return self.apply_raw()
150
--> 151 return self.apply_standard()
152
153 def apply_empty_result(self):
/usr/local/miniconda/lib/python3.7/site-packages/pandas/core/apply.py in apply_standard(self)
255
256 # compute the result using the series generator
--> 257 self.apply_series_generator()
258
259 # wrap results
/usr/local/miniconda/lib/python3.7/site-packages/pandas/core/apply.py in apply_series_generator(self)
284 try:
285 for i, v in enumerate(series_gen):
--> 286 results[i] = self.f(v)
287 keys.append(v.name)
288 except Exception as e:
/usr/local/miniconda/lib/python3.7/site-packages/pandas/core/apply.py in f(x)
76
77 def f(x):
---> 78 return func(x, *args, **kwds)
79 else:
80 f = func
/usr/local/miniconda/lib/python3.7/site-packages/pandas/io/formats/style.py in _background_gradient(s, cmap, low, high, text_color_threshold)
941 smin = s.values.min()
942 smax = s.values.max()
--> 943 rng = smax - smin
944 # extend lower / upper bounds, compresses color range
945 norm = colors.Normalize(smin - (rng * low), smax + (rng * high))
TypeError: ("unsupported operand type(s) for -: 'str' and 'str'", 'occurred at index text_column')
<pandas.io.formats.style.Styler at 0x7f948dde7278>
which seems to come from the fact that it's trying to do an arithmetic operation on the strings in text_column. Fair enough. How do I tell it to apply only to the last row of the non-text columns? I'm OK with giving it explicit column names to use or avoid, but I don't know how to pass that into this inscrutable subset argument.
I am running:
python version 3.7.3
pandas version 0.24.2

Using a tuple for subset worked for me, though I'm not sure it is the most elegant solution:
df.style.background_gradient(
    cmap=cm,
    subset=(df.index[-1], df.select_dtypes(float).columns)
)
Output: a rendered table with the background gradient applied only to the numeric cells of the last row (screenshot omitted).
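An equivalent way to spell the same subset, if you prefer pandas' label-based slicing helper, is pd.IndexSlice. A minimal sketch, assuming the df and cm from the question; select_dtypes('number') is used here so that any numeric dtype (not just float) is picked up:
import pandas as pd

# Build a (rows, columns) subset: last row only, numeric columns only.
numeric_cols = df.select_dtypes('number').columns
df.style.background_gradient(
    cmap=cm,
    subset=pd.IndexSlice[df.index[-1], numeric_cols],
)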

You want to apply a style to a pandas dataframe and set different colors on different columns or rows.
Here is code you can adapt and run on your own df. :)
Apply the style along rows using axis=0 with a subset built from df.index, or along columns with axis=1 and a subset built from df.columns; this example restricts each gradient to a range of columns taken from df.columns:
cmaps = [
'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn'
]
df.style.\
    background_gradient(
        cmap=cmaps[1], axis=0,
        subset=(
            df.index[:],
            df.columns[df.columns.get_loc('nb tickets'):df.columns.get_loc('nb ref_prod') + 1]
        )
    ).\
    background_gradient(
        cmap=cmaps[3],
        subset=(
            df.index[:],
            df.columns[df.columns.get_loc('am'):df.columns.get_loc('pm') + 1]
        )
    ).\
    background_gradient(
        cmap=cmaps[4],
        subset=(
            df.index[:],
            df.columns[df.columns.get_loc('Week_1'):df.columns.get_loc('Week_5') + 1]
        )
    ).\
    background_gradient(
        cmap=cmaps[5],
        subset=(
            df.index[:],
            df.columns[df.columns.get_loc('sum qty'):df.columns.get_loc('sum euro') + 1]
        )
    )

Related

AttributeError: type object 'object' has no attribute 'dtype'

I'm trying to replicate the results described in How to Determine the Best Fitting Data Distribution Using Python, using the following code:
import numpy as np
from distfit import distfit
# Generate 10000 normal distribution samples with mean 0, std dev of 3
X = np.random.normal(0, 3, 10000)
# Initialize distfit
dist = distfit()
# Determine best-fitting probability distribution for data
dist.fit_transform(X)
However, I obtained the following error:
[distfit] >fit..
[distfit] >transform..
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-8-02f73e7f157d> in <module>
9
10 # Determine best-fitting probability distribution for data
---> 11 dist.fit_transform(X)
~\Anaconda3\lib\site-packages\distfit\distfit.py in fit_transform(self, X, verbose)
275 self.fit(verbose=verbose)
276 # Transform X based on functions
--> 277 self.transform(X, verbose=verbose)
278 # Store
279 results = _store(self.alpha,
~\Anaconda3\lib\site-packages\distfit\distfit.py in transform(self, X, verbose)
214 if self.method=='parametric':
215 # Compute best distribution fit on the empirical X
--> 216 out_summary, model = _compute_score_distribution(X, X_bins, y_obs, self.distributions, self.stats, verbose=verbose)
217 # Determine confidence intervals on the best fitting distribution
218 model = _compute_cii(self, model, verbose=verbose)
~\Anaconda3\lib\site-packages\distfit\distfit.py in _compute_score_distribution(data, X, y_obs, DISTRIBUTIONS, stats, verbose)
906 model['params'] = (0.0, 1.0)
907 best_score = np.inf
--> 908 df = pd.DataFrame(index=range(0, len(DISTRIBUTIONS)), columns=['distr', 'score', 'LLE', 'loc', 'scale', 'arg'])
909 max_name_len = np.max(list(map(lambda x: len(x.name), DISTRIBUTIONS)))
910
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
346 dtype=dtype, copy=copy)
347 elif isinstance(data, dict):
--> 348 mgr = self._init_dict(data, index, columns, dtype=dtype)
349 elif isinstance(data, ma.MaskedArray):
350 import numpy.ma.mrecords as mrecords
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _init_dict(self, data, index, columns, dtype)
449 nan_dtype = dtype
450 v = construct_1d_arraylike_from_scalar(np.nan, len(index),
--> 451 nan_dtype)
452 arrays.loc[missing] = [v] * missing.sum()
453
~\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in construct_1d_arraylike_from_scalar(value, length, dtype)
1194 else:
1195 if not isinstance(dtype, (np.dtype, type(np.dtype))):
-> 1196 dtype = dtype.dtype
1197
1198 # coerce if we have nan for an integer dtype
AttributeError: type object 'object' has no attribute 'dtype'
(I'm using Jupyter.)
How can I fix this problem?
The solution to the above error, as noted in the comments on the question, was to upgrade pandas: the issue occurs with pandas versions 1.0.4 and lower.
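A quick way to check whether you're on an affected version before upgrading (a minimal sketch; the exact upgrade command depends on your environment, e.g. pip vs conda):
import pandas as pd

# The error above was reported with pandas 1.0.4 and lower.
print(pd.__version__)

# If the version is affected, upgrade from a shell or notebook cell, e.g.:
#   pip install --upgrade pandas
#   (or) conda update pandas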

Kaggle ASL Dataset: ValueError: If using all scalar values, you must pass an index

I am trying to work on the Kaggle ASL Dataset, and during preprocessing I tried to scale the values of each pixel column.
I did the following steps in Google Colab:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
train = pd.read_csv("sign-language-mnist/sign_mnist_train.csv")
scaler = MinMaxScaler()
new_df = train.apply(lambda x: scaler.fit_transform(x.values.reshape(1,-1)),axis=0)
While trying to run this piece of code, I am getting the following error:
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7550 kwds=kwds,
7551 )
-> 7552 return op.get_result()
7553
7554 def applymap(self, func) -> "DataFrame":
/usr/local/lib/python3.6/dist-packages/pandas/core/apply.py in get_result(self)
178 return self.apply_raw()
179
--> 180 return self.apply_standard()
181
182 def apply_empty_result(self):
/usr/local/lib/python3.6/dist-packages/pandas/core/apply.py in apply_standard(self)
272
273 # wrap results
--> 274 return self.wrap_results(results, res_index)
275
276 def apply_series_generator(self) -> Tuple[ResType, "Index"]:
/usr/local/lib/python3.6/dist-packages/pandas/core/apply.py in wrap_results(self, results, res_index)
313 # see if we can infer the results
314 if len(results) > 0 and 0 in results and is_sequence(results[0]):
--> 315 return self.wrap_results_for_axis(results, res_index)
316
317 # dict of scalars
/usr/local/lib/python3.6/dist-packages/pandas/core/apply.py in wrap_results_for_axis(self, results, res_index)
369
370 try:
--> 371 result = self.obj._constructor(data=results)
372 except ValueError as err:
373 if "arrays must all be same length" in str(err):
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
466
467 elif isinstance(data, dict):
--> 468 mgr = init_dict(data, index, columns, dtype=dtype)
469 elif isinstance(data, ma.MaskedArray):
470 import numpy.ma.mrecords as mrecords
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
281 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
282 ]
--> 283 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
284
285
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
76 # figure out the index, if necessary
77 if index is None:
---> 78 index = extract_index(arrays)
79 else:
80 index = ensure_index(index)
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/construction.py in extract_index(data)
385
386 if not indexes and not raw_lengths:
--> 387 raise ValueError("If using all scalar values, you must pass an index")
388
389 if have_series:
ValueError: If using all scalar values, you must pass an index
However, the following piece of code works fine:
new_df = pd.DataFrame(scaler.fit_transform(train), columns=train.columns)
So, the question is: what is going wrong?
Alternatively, can anyone explain what I need to know to work out why the first version gives that error?
Thanks in advance.
You could try
train.iloc[:, 1:] = scaler.fit_transform(train.iloc[:, 1:])
In any case, you wouldn't want to scale the label column anyway.
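If you want to keep the apply-per-column style from the question, the likely culprit is that the lambda returns a 2-D (1, n) array for each column rather than a 1-D result of the same length as the column, so the DataFrame constructor cannot align the results. A minimal sketch of a working variant, assuming the first column of the Kaggle CSV is named 'label':
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

train = pd.read_csv("sign-language-mnist/sign_mnist_train.csv")
scaler = MinMaxScaler()

# Reshape each pixel column to (n_samples, 1), scale it, then flatten back to 1-D
# so apply() can reassemble the results; the label column is left untouched.
pixel_cols = train.columns.drop("label")
scaled = train[pixel_cols].apply(
    lambda col: scaler.fit_transform(col.values.reshape(-1, 1)).ravel()
)
new_df = pd.concat([train["label"], scaled], axis=1)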

Shapes not aligned in Python AutoImpute data imputation package?

I'm trying to use the (relatively new) Python AutoImpute package, but I keep getting a shape mismatch error when trying to use a particular column as a predictor.
This is what my pandas dataframe looks like (screenshot of the dataframe not included).
I can impute using the 'sex', 'group', and 'binned_age' columns, but not using the 'experiment' column. When I try doing that, I get this error:
ValueError: shapes (9,) and (4,13) not aligned: 9 (dim 0) != 4 (dim 0)
This is my code for actually fitting and running the imputer:
cat_predictors = ['experiment', 'sex', 'group', 'binned_age']
si = SingleImputer(
strategy={'FSIQ': 'default predictive'},
predictors={'FSIQ': cat_predictors},
)
imputed_data = si.fit_transform(df2)
In trying to diagnose the problem, I found out that if I reduce the number of unique strings in the 'experiment' column to 3 or fewer, my problem goes away for some reason. But, I don't want to do that and lose some of my data. Any help?
Full trace below:
ValueError Traceback (most recent call last)
<ipython-input-11-3d4388ba92e4> in <module>
1 si = SingleImputer(
2 strategy={'FSIQ': 'pmm'}, imp_kwgs={'pmm': {'tune': 10000, 'sample':10000}})
----> 3 data_imputed_once = si.fit_transform(df2)
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/imputations/dataframe/single_imputer.py in fit_transform(self, X, y)
288 X (pd.DataFrame): imputed in place or copy of original.
289 """
--> 290 return self.fit(X, y).transform(X)
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/utils/checks.py in wrapper(d, *args, **kwargs)
59 err = f"Neither {d_err} nor {a_err} are of type pd.DataFrame"
60 raise TypeError(err)
---> 61 return func(d, *args, **kwargs)
62 return wrapper
63
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/utils/checks.py in wrapper(d, *args, **kwargs)
124
125 # return func if no missingness violations detected, then return wrap
--> 126 return func(d, *args, **kwargs)
127 return wrapper
128
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/utils/checks.py in wrapper(d, *args, **kwargs)
171 err = f"All values missing in column(s) {nc}. Should be removed."
172 raise ValueError(err)
--> 173 return func(d, *args, **kwargs)
174 return wrapper
175
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/imputations/dataframe/single_imputer.py in transform(self, X, imp_ixs)
274
275 # perform imputation given the specified imputer and value for x_
--> 276 X.loc[imp_ix, column] = imputer.impute(x_)
277 return X
278
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/imputations/series/pmm.py in impute(self, X)
187 # imputed values are actual y vals corresponding to nearest neighbors
188 # therefore, this is a form of "hot-deck" imputation
--> 189 y_pred_bayes = alpha_bayes + beta_bayes.dot(X.T)
190 n_ = self.neighbors
191 if X.columns.size == 1:
ValueError: shapes (9,) and (4,13) not aligned: 9 (dim 0) != 4 (dim 0)

memory error in dask when using dummy encoder

I am trying to dummy-encode a dask dataframe, train_final[categorical_var]. However, when I run the code I get a memory error. Why does this happen, given that dask is supposed to process the data chunk by chunk?
The code is below:
from dask_ml.preprocessing import DummyEncoder
de = DummyEncoder()
train_final_cat = de.fit_transform(train_final[categorical_var])
The error:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-84-e21592c13279> in <module>
1 from dask_ml.preprocessing import DummyEncoder
2 de = DummyEncoder()
----> 3 train_final_cat = de.fit_transform(train_final[categorical_var])
~/env/lib/python3.5/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
460 if y is None:
461 # fit method of arity 1 (unsupervised transformation)
--> 462 return self.fit(X, **fit_params).transform(X)
463 else:
464 # fit method of arity 2 (supervised transformation)
~/env/lib/python3.5/site-packages/dask_ml/preprocessing/data.py in fit(self, X, y)
602
603 self.transformed_columns_ = pd.get_dummies(
--> 604 sample, drop_first=self.drop_first
605 ).columns
606 return self
~/env/lib/python3.5/site-packages/pandas/core/reshape/reshape.py in get_dummies(data, prefix, prefix_sep, dummy_na, columns, sparse, drop_first, dtype)
890 dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep,
891 dummy_na=dummy_na, sparse=sparse,
--> 892 drop_first=drop_first, dtype=dtype)
893 with_dummies.append(dummy)
894 result = concat(with_dummies, axis=1)
~/env/lib/python3.5/site-packages/pandas/core/reshape/reshape.py in _get_dummies_1d(data, prefix, prefix_sep, dummy_na, sparse, drop_first, dtype)
978
979 else:
--> 980 dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)
981
982 if not dummy_na:
~/env/lib/python3.5/site-packages/numpy/lib/twodim_base.py in eye(N, M, k, dtype, order)
184 if M is None:
185 M = N
--> 186 m = zeros((N, M), dtype=dtype, order=order)
187 if k >= M:
188 return m
MemoryError:
Would anyone be able to give me some direction in this regard?
Thanks
Michael
Encoding dummy variables is a very memory-intensive task, because you're creating a new column for each unique value of your categorical_column. If categorical_column has high cardinality, even a single chunk can explode in size. Also, creating dummies is not "embarrassingly parallel", so workers can't just process each chunk independently; they need to communicate and replicate some data during the computation.
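One pattern worth trying (a sketch based on the dask-ml preprocessing docs, not a guaranteed fix for the memory error) is to make the columns categorical with known categories before dummy encoding, so DummyEncoder can derive the output columns from the category metadata rather than from the raw data:
from dask_ml.preprocessing import Categorizer, DummyEncoder
from sklearn.pipeline import make_pipeline

# Categorizer converts object/string columns to categorical dtype with known
# categories; DummyEncoder then expands each categorical column into dummies.
pipe = make_pipeline(Categorizer(), DummyEncoder())
train_final_cat = pipe.fit_transform(train_final[categorical_var])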

Pandas OneHotEncoder.fit(dataframe) returns ValueError: invalid literal for long() with base 10

I'm trying to convert a Pandas dataframe to a NumPy array to create a model with Sklearn. I'll simplify the problem here.
>>> mydf.head(10)
IdVisita
445 latam
446 NaN
447 grados
448 grados
449 eventos
450 eventos
451 Reescribe-medios-clases-online
454 postgrados
455 postgrados
456 postgrados
Name: cat1, dtype: object
>>> from sklearn import preprocessing
>>> enc = preprocessing.OneHotEncoder()
>>> enc.fit(mydf)
Traceback:
ValueError Traceback (most recent call last)
<ipython-input-74-f581ab15cbed> in <module>()
2 mydf.head(10)
3 enc = preprocessing.OneHotEncoder()
----> 4 enc.fit(mydf)
/home/dukebody/Apps/Anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.pyc in fit(self, X, y)
996 self
997 """
--> 998 self.fit_transform(X)
999 return self
1000
/home/dukebody/Apps/Anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.pyc in fit_transform(self, X, y)
1052 """
1053 return _transform_selected(X, self._fit_transform,
-> 1054 self.categorical_features, copy=True)
1055
1056 def _transform(self, X):
/home/dukebody/Apps/Anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.pyc in _transform_selected(X, transform, selected, copy)
870 """
871 if selected == "all":
--> 872 return transform(X)
873
874 X = atleast2d_or_csc(X, copy=copy)
/home/dukebody/Apps/Anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.pyc in _fit_transform(self, X)
1001 def _fit_transform(self, X):
1002 """Assumes X contains only categorical features."""
-> 1003 X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
1004 if np.any(X < 0):
1005 raise ValueError("X needs to contain only non-negative integers.")
/home/dukebody/Apps/Anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_arrays(*arrays, **options)
279 array = np.ascontiguousarray(array, dtype=dtype)
280 else:
--> 281 array = np.asarray(array, dtype=dtype)
282 if not allow_nans:
283 _assert_all_finite(array)
/home/dukebody/Apps/Anaconda/lib/python2.7/site-packages/numpy/core/numeric.pyc in asarray(a, dtype, order)
460
461 """
--> 462 return array(a, dtype, copy=False, order=order)
463
464 def asanyarray(a, dtype=None, order=None):
ValueError: invalid literal for long() with base 10: 'postgrados'
Notice IdVisita is the index here and numbers might not be all consecutive.
Any clues?
Your error here is that you are calling OneHotEncoder which, according to the docs:
The input to this transformer should be a matrix of integers
but your df has a single column, 'cat1', which is of dtype object and in fact holds strings.
You should use LabelEncoder:
In [13]:
le = preprocessing.LabelEncoder()
le.fit(df.dropna().values)
le.classes_
C:\WinPython-64bit-3.3.3.2\python-3.3.3.amd64\lib\site-packages\sklearn\preprocessing\label.py:108: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
Out[13]:
array(['Reescribe-medios-clases-online', 'eventos', 'grados', 'latam',
'postgrados'], dtype=object)
Note that I had to drop the NaN row, as it introduces a mixed dtype that cannot be used for ordering, e.g. float > str will not work.
A simpler approach is to use DictVectorizer, which does the conversion to integers as well as the one-hot encoding in a single step.
Using it with the argument DictVectorizer(sparse=False) gives you a dense array from fit_transform, which is easy to wrap back into a DataFrame so you can keep working with Pandas.
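A minimal sketch of that approach, assuming mydf is a DataFrame with a single column named 'cat1' as in the question and that NaNs are first filled with a placeholder string:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
# Each row becomes a dict like {'cat1': 'latam'}; DictVectorizer one-hot encodes
# the string values and returns a dense array.
records = mydf.fillna('missing').to_dict(orient='records')
encoded = dv.fit_transform(records)
encoded_df = pd.DataFrame(encoded,
                          columns=dv.get_feature_names(),  # get_feature_names_out() on newer scikit-learn
                          index=mydf.index)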
