failing simple groupby example from "Python for Data Analysis" text - python

I just started learning Python (mostly as an open source replacement for MATLAB, using "ipython --pylab"), and I'm working through the examples in the "Python for Data Analysis" text. On page 253, a simple example is shown using 'groupby' (passing a list of arrays). I repeat it exactly as in the text, but I get this error:
"TypeError: 'Series' objects are mutable, thus they cannot be hashed"
import numpy as np
import pandas as pd
from pandas import DataFrame

df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})
grouped = df['data1'].groupby(df['key1'])
means = df['data1'].groupby(df['key1'], df['key2']).mean()
-----DETAILS OF TYPEERROR-------
TypeError Traceback (most recent call last)
<ipython-input-7-0412f2897849> in <module>()
----> 1 means = df['data1'].groupby(df['key1'],df['key2']).mean()
/home/joeblow/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/pandas/core/generic.pyc in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze)
2725
2726 from pandas.core.groupby import groupby
-> 2727 axis = self._get_axis_number(axis)
2728 return groupby(self, by, axis=axis, level=level, as_index=as_index,
2729 sort=sort, group_keys=group_keys, squeeze=squeeze)
/home/joeblow/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_axis_number(self, axis)
283
284 def _get_axis_number(self, axis):
--> 285 axis = self._AXIS_ALIASES.get(axis, axis)
286 if com.is_integer(axis):
287 if axis in self._AXIS_NAMES:
/home/joeblow/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/pandas/core/generic.pyc in __hash__(self)
639 def __hash__(self):
640 raise TypeError('{0!r} objects are mutable, thus they cannot be'
--> 641 ' hashed'.format(self.__class__.__name__))
642
643 def __iter__(self):
TypeError: 'Series' objects are mutable, thus they cannot be hashed
What simple thing am I missing here?

You didn't do it exactly as in the text. :^)
>>> means = df['data1'].groupby([df['key1'],df['key2']]).mean()
>>> means
key1  key2
a     one     1.127536
      two     1.220386
b     one     0.402765
      two    -0.058255
dtype: float64
If you're grouping by two arrays, you need to pass a list of the arrays. You instead passed two positional arguments, df['key1'] and df['key2'], which are interpreted as by and axis; pandas then tries to look up the second Series as an axis, and hashing it fails with the TypeError above.
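As a side note (a minimal sketch using the same df, not part of the original answer), the equivalent DataFrame-level form groups by column names and selects the column afterwards, which avoids building the key Series by hand:
means = df.groupby(['key1', 'key2'])['data1'].mean()
means.unstack()   # pivots the inner index level into columns, as in the book's next example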

Related

Pandas: Can assign 1-column DataFrame to Series but not to DataFrame of same shape?

Python version 3.7.13, pandas version 1.3.5
I have two categorical DataFrames:
import pandas as pd

df1 = pd.DataFrame(
    data={
        "col1": pd.Categorical(["A", "B", "C"]),
    },
    index=[0, 1, 2],
)
# and df2 defined exactly the same way,
# both of which look like the following
  col1
0    A
1    B
2    C
Now, if I want to assign col1 of df2 to col1 of df1, I do this:
type(df1.loc[:, "col1"])
# pandas.core.series.Series
df1.loc[:, "col1"] = df2.loc[:, "col1"]
This way both slices are actually Series, and it works as intended just fine.
If I sliced df2 to keep it a DataFrame while df1 is sliced into a Series, I still get the same result.
type(df2.loc[:, ["col1"]])
# pandas.core.frame.DataFrame
df1.loc[:, "col1"] = df2.loc[:, ["col1"]]
However, if I sliced both sides to be DataFrames, which have the same shape, it no longer works for some reason.
df1.loc[:, ["col1"]].shape == df2.loc[:, ["col1"]].shape
# True
df1.loc[:, ["col1"]] = df2.loc[:, ["col1"]]
ValueError Traceback (most recent call last)
<ipython-input-69-571ef77d04b6> in <module>()
----> 1 df1.loc[:, ["col1"]] = df2.loc[:, ["col1"]]
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __setitem__(self, key, value)
721
722 iloc = self if self.name == "iloc" else self.obj.iloc
--> 723 iloc._setitem_with_indexer(indexer, value, self.name)
724
725 def _validate_key(self, key, axis: int):
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value, name)
1730 self._setitem_with_indexer_split_path(indexer, value, name)
1731 else:
-> 1732 self._setitem_single_block(indexer, value, name)
1733
1734 def _setitem_with_indexer_split_path(self, indexer, value, name: str):
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _setitem_single_block(self, indexer, value, name)
1966
1967 # actually do the set
-> 1968 self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
1969 self.obj._maybe_update_cacher(clear=True)
1970
/usr/local/lib/python3.7/dist-packages/pandas/core/internals/managers.py in setitem(self, indexer, value)
353
354 def setitem(self: T, indexer, value) -> T:
--> 355 return self.apply("setitem", indexer=indexer, value=value)
356
357 def putmask(self, mask, new, align: bool = True):
/usr/local/lib/python3.7/dist-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
325 applied = b.apply(f, **kwargs)
326 else:
--> 327 applied = getattr(b, f)(**kwargs)
328 except (TypeError, NotImplementedError):
329 if not ignore_failures:
/usr/local/lib/python3.7/dist-packages/pandas/core/internals/blocks.py in setitem(self, indexer, value)
1489
1490 check_setitem_lengths(indexer, value, self.values)
-> 1491 self.values[indexer] = value
1492 return self
1493
/usr/local/lib/python3.7/dist-packages/pandas/core/arrays/_mixins.py in __setitem__(self, key, value)
180 def __setitem__(self, key, value):
181 key = check_array_indexer(self, key)
--> 182 value = self._validate_setitem_value(value)
183 self._ndarray[key] = value
184
/usr/local/lib/python3.7/dist-packages/pandas/core/arrays/categorical.py in _validate_setitem_value(self, value)
2043 if len(to_add) and not isna(to_add).all():
2044 raise ValueError(
-> 2045 "Cannot setitem on a Categorical with a new "
2046 "category, set the categories first"
2047 )
ValueError: Cannot setitem on a Categorical with a new category, set the categories first
I find this very confusing: naively I would have expected the second case (assigning a DataFrame to a Series) not to work, and assigning two objects of the same type to work.
This clearly has something to do with the pd.Categorical dtype, because if both DataFrames had object or numerical dtypes, no such exception occurs for any of these ways of slicing. But then again, both DataFrames have exactly identical categories and values, so I don't see why it should complain about new categories.
Also, if both DataFrames had more than 1 column, this exception never occurs either, i.e. the following works as intended:
df1 = pd.DataFrame(
    data={
        "col1": pd.Categorical(["A", "B", "C"]),
        "col2": pd.Categorical(["C", "B", "A"]),
    },
    index=[0, 1, 2],
)
# df2 defined exactly the same way
df1.loc[:, ["col1", "col2"]] = df2.loc[:, ["col1", "col2"]]
Can someone explain what is special about assigning a 1-column categorical DataFrame to another please?
Edit:
It seems this issue is not consistently reproduced on different Python and pandas versions (none of the cases above fails in Python 3.9.12 and pandas 1.3.2, plus yet another scenario in the comments). So let me go one step further. Originally I ran into this exception when I wanted to do:
df1.loc[:, ["col1"]] = df2.loc[:, ["col1"]].notnull()
which fails in both of my environments. However, again, if df1 was sliced to be a Series, it somehow works:
df1.loc[:, "col1"] = df2.loc[:, ["col1"]].notnull()
I know there are workarounds to accomplish this, which is why I didn't bring up this part at first and thought there was something more fundamental to the issue regarding DataFrame and Series types.
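For completeness, a minimal sketch of the kind of workaround alluded to above (my own addition, not part of the question): plain column assignment replaces the column and its dtype wholesale, so the Categorical setitem validation is never reached.
# replaces col1 entirely with a bool column, bypassing the Categorical setitem path
df1["col1"] = df2["col1"].notnull()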

Is there a way to automate data cleaning for pandas DataFrames?

I am cleaning my data for a machine learning project by replacing the missing values with zeros and the mean for the 'Age' and 'Fare' columns respectively. The code for this is given below:
train_data['Age'] = train_data['Age'].fillna(0)
mean = train_data['Fare'].mean()
train_data['Fare'] = train_data['Fare'].fillna(mean)
Since I would have to do this multiple times for other sets of data, I want to automate this process by creating a generic function that takes the DataFrame as input, performs the operations that modify it, and returns the modified DataFrame. The code for that is given below:
def data_cleaning(df):
    df['Age'] = df['Age'].fillna(0)
    fare_mean = df['Fare'].mean()
    df['Fare'] = df['Fare'].fillna()
    return df
However when I pass the training data DataFrame:
train_data = data_cleaning(train_data)
I get the following error:
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:2:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-
docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_42/1440633985.py in <module>
1 #print(train_data)
----> 2 train_data = data_cleaning(train_data)
3 cross_val_data = data_cleaning(cross_val_data)
/tmp/ipykernel_42/3053068338.py in data_cleaning(df)
2 df['Age'] = df['Age'].fillna(0)
3 fare_mean = df['Fare'].mean()
----> 4 df['Fare'] = df['Fare'].fillna()
5 return df
/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args,
**kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
/opt/conda/lib/python3.7/site-packages/pandas/core/series.py in fillna(self, value,
method, axis, inplace, limit, downcast)
4820 inplace=inplace,
4821 limit=limit,
-> 4822 downcast=downcast,
4823 )
4824
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in fillna(self, value,
method, axis, inplace, limit, downcast)
6311 """
6312 inplace = validate_bool_kwarg(inplace, "inplace")
-> 6313 value, method = validate_fillna_kwargs(value, method)
6314
6315 self._consolidate_inplace()
/opt/conda/lib/python3.7/site-packages/pandas/util/_validators.py in
validate_fillna_kwargs(value, method, validate_scalar_dict_value)
368
369 if value is None and method is None:
--> 370 raise ValueError("Must specify a fill 'value' or 'method'.")
371 elif value is None and method is not None:
372 method = clean_fill_method(method)
ValueError: Must specify a fill 'value' or 'method'.
On some research, I found that I would have to use the apply() and map() functions instead, but I am not sure how to pass in the mean value of the column. Furthermore, this does not scale well, as I would have to calculate all the fillna values before passing them into the function, which is cumbersome. Therefore I want to ask: is there a better way to automate data cleaning?
In this line of your function, df['Fare'] = df['Fare'].fillna(), you did not give fillna anything to fill the NaNs with, which is why it raises the error. You should change it to df['Fare'] = df['Fare'].fillna(fare_mean).
If you intend to make this usable from another file in the same directory, you can simply import it there:
from file_that_contain_function import function_name
And if you intend to make it reusable across your workspace/virtual environment, you may need to create your own Python package.
So yes, the other answer explains where the error is coming from.
However, the warning at the beginning has nothing to do with filling NaNs. The warning is telling you that you are setting a value on a copy of a slice of your dataframe. Change your code to
def data_cleaning(df):
    df['Age'] = df.loc[:, 'Age'].fillna(0)
    fare_mean = df['Fare'].mean()
    df['Fare'] = df.loc[:, 'Fare'].fillna(fare_mean)  # <- and also fix this error
    return df
I suggest also searching that specific warning here, as there are hundreds of posts detailing this warning and how to deal with it. Here's a good one.
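To address the broader "automate this" part of the question, a minimal sketch (my own generalization, not from either answer) drives the function with a mapping of column names to fill strategies:
def data_cleaning(df, fill_spec):
    # fill_spec maps column name -> scalar fill value or the string 'mean'
    df = df.copy()  # work on a copy to sidestep SettingWithCopyWarning surprises
    for col, strategy in fill_spec.items():
        fill_value = df[col].mean() if strategy == 'mean' else strategy
        df[col] = df[col].fillna(fill_value)
    return df

train_data = data_cleaning(train_data, {'Age': 0, 'Fare': 'mean'})
This keeps all the column-specific choices in one dictionary, so reusing it on another dataset is just a matter of passing a different fill_spec.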

How to separate the pandas table into separate groups?

I am struggling to separate the data. I tried looking at the groupby function pandas has, but it doesn't seem to work. I don't understand what I am doing wrong.
data = pd.read_csv("path/file")
y=data['JIN-DIF']
y1=data['JEX-DIF']
y2=data['JEL-DIF']
y3=data['D3E']
d={'Induction':y,'Exchange':y1,'Dispersion':y3,'Electrostatic':y2}
df=pd.DataFrame(d)
grouped_df2= df.groupby('Exchange')
grouped_df2.filter(lambda x: x.Exchange > 0)
When I run this code, I get a "TypeError: filter function returned a Series, but expected a scalar bool" error. I'm not sure how to upload the data, so I have just attached a picture of it.
It will work when I change line 9 to
grouped_df2.filter(lambda x: x.Exchange.mean() > 0)
Here is a picture of sample data
The error message
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-71-0e26eb8f080b> in <module>
7 df=pd.DataFrame(d)
8 grouped_df2= df.groupby('Exchange')
----> 9 grouped_df2.filter(lambda x: x.Exchange > -0.1)
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/generic.py in filter(self, func, dropna, *args, **kwargs)
1584 # non scalars aren't allowed
1585 raise TypeError(
-> 1586 f"filter function returned a {type(res).__name__}, "
1587 "but expected a scalar bool"
1588 )
TypeError: filter function returned a Series, but expected a scalar bool
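For what it's worth, here is a minimal sketch of the distinction (my reading of the error, not an answer from the thread): groupby.filter must return a single boolean per group, so an aggregate such as .mean() works, whereas keeping individual rows where Exchange is positive needs no groupby at all.
# keep whole groups whose mean Exchange is positive
kept_groups = df.groupby('Exchange').filter(lambda x: x.Exchange.mean() > 0)

# keep individual rows where Exchange is positive (a plain boolean mask)
kept_rows = df[df['Exchange'] > 0]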

find which row in dataframe causes groupby or transform to fail

I have a dataframe of shape (2061, 5) and the following line:
df[6] = df.groupby(df.index)[6].transform(lambda x: ' '.join(x))
..causes the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-19-27721ddd8064> in <module>
----> 1 df.groupby(df.index)[6].transform(lambda x: ' '.join(x))
~/.pyenv/versions/miniconda3-latest/lib/python3.7/site-packages/pandas/core/groupby/generic.py in transform(self, func, *args, **kwargs)
463
464 if not isinstance(func, str):
--> 465 return self._transform_general(func, *args, **kwargs)
466
467 elif func not in base.transform_kernel_whitelist:
~/.pyenv/versions/miniconda3-latest/lib/python3.7/site-packages/pandas/core/groupby/generic.py in _transform_general(self, func, *args, **kwargs)
487 for name, group in self:
488 object.__setattr__(group, "name", name)
--> 489 res = func(group, *args, **kwargs)
490
491 if isinstance(res, (ABCDataFrame, ABCSeries)):
<ipython-input-19-27721ddd8064> in <lambda>(x)
----> 1 df.groupby(df.index)[6].transform(lambda x: ' '.join(x))
TypeError: sequence item 0: expected str instance, float found
I developed that code on a subset of the dataframe and it seemed to be doing exactly what I wanted to the data. So now if I for example do this:
df = df.head(50)
..and run the code, the error message goes away again.
I think somewhere a type cast is happening, except at one of the rows it decides to do something else. How can I efficiently find which row in the df is causing this, without manually reading through the whole two-thousand-row column or doing trial and error with .head() of different sizes?
EDITED: Mask the column in question to keep only the rows where the column has a float value, then check the first index. I.e.:
mask = df['column_in_q'].apply(lambda x: type(x) == float)
# This returns a Boolean Series that can be used to keep only the True rows
float_df = df[mask]  # Subset of the DF that meets the condition
print(float_df.head())
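As a follow-up sketch (my addition, not part of the answer), the offending row labels can also be pulled out directly, which works even if the column mixes several non-string types:
# rows whose value in column 6 is not a string are the ones ' '.join() chokes on
bad_rows = df[~df[6].apply(lambda x: isinstance(x, str))]
print(bad_rows.index.tolist())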
I think this is because the groupby method returns a GroupBy object, not a dataframe. You have to specify aggregation methods, which you could then subset. That is:
df[6] = df.groupby(df.index).sum()[6].transform(lambda x: ' '.join(x))
See here for more: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html

simple dask map_partitions example

I read the following SO thread and now am trying to understand it. Here is my example:
import dask.dataframe as dd
import pandas as pd
from dask.multiprocessing import get
import random

df = pd.DataFrame({'col_1': random.sample(range(10000), 10000),
                   'col_2': random.sample(range(10000), 10000)})

def test_f(col_1, col_2):
    return col_1 * col_2

ddf = dd.from_pandas(df, npartitions=8)
ddf['result'] = ddf.map_partitions(test_f, columns=['col_1', 'col_2']).compute(get=get)
It generates the error below. What am I doing wrong? Also, I am not clear on how to pass additional parameters to the function in map_partitions.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\dask\dataframe\utils.py in raise_on_meta_error(funcname)
136 try:
--> 137 yield
138 except Exception as e:
~\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\dask\dataframe\core.py in _emulate(func, *args, **kwargs)
3130 with raise_on_meta_error(funcname(func)):
-> 3131 return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
3132
TypeError: test_f() got an unexpected keyword argument 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-9-913789c7326c> in <module>()
----> 1 ddf['result'] = ddf.map_partitions(test_f, columns=['col_1', 'col_2']).compute(get=get)
~\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\dask\dataframe\core.py in map_partitions(self, func, *args, **kwargs)
469 >>> ddf.map_partitions(func).clear_divisions() # doctest: +SKIP
470 """
--> 471 return map_partitions(func, self, *args, **kwargs)
472
473 #insert_meta_param_description(pad=12)
~\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\dask\dataframe\core.py in map_partitions(func, *args, **kwargs)
3163
3164 if meta is no_default:
-> 3165 meta = _emulate(func, *args, **kwargs)
3166
3167 if all(isinstance(arg, Scalar) for arg in args):
~\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\dask\dataframe\core.py in _emulate(func, *args, **kwargs)
3129 """
3130 with raise_on_meta_error(funcname(func)):
-> 3131 return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
3132
3133
~\AppData\Local\conda\conda\envs\tensorflow\lib\contextlib.py in __exit__(self, type, value, traceback)
75 value = type()
76 try:
---> 77 self.gen.throw(type, value, traceback)
78 except StopIteration as exc:
79 # Suppress StopIteration *unless* it's the same exception that
~\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\dask\dataframe\utils.py in raise_on_meta_error(funcname)
148 ).format(" in `{0}`".format(funcname) if funcname else "",
149 repr(e), tb)
--> 150 raise ValueError(msg)
151
152
ValueError: Metadata inference failed in `test_f`.
Original error is below:
------------------------
TypeError("test_f() got an unexpected keyword argument 'columns'",)
Traceback:
---------
File "C:\Users\some_user\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\dask\dataframe\utils.py", line 137, in raise_on_meta_error
yield
File "C:\Users\some_user\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\dask\dataframe\core.py", line 3131, in _emulate
return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
There is an example in the map_partitions docs that achieves exactly what you are trying to do:
ddf.map_partitions(lambda df: df.assign(z=df.x * df.y))
When you call map_partitions (just like when you call .apply() on a pandas.DataFrame), the function that you try to map (or apply) will be given a dataframe as its first argument.
In the case of dask.dataframe.map_partitions this first argument will be a partition, and in the case of pandas.DataFrame.apply it will be the whole dataframe.
This means that your function has to accept a dataframe (partition) as its first argument, and in your case it could look like this:
def test_f(df, col_1, col_2):
    return df.assign(result=df[col_1] * df[col_2])
Note that assignment of a new column in this case happens (i.e. gets scheduled to happen) BEFORE you call .compute().
In your example you assign the column AFTER you call .compute(), which kind of defeats the purpose of using dask. I.e. after you call .compute() the results of that operation are loaded into memory, if there is enough space for them (if not, you just get a MemoryError).
So for your example to work you could:
1) Use a function (with column names as arguments):
def test_f(df, col_1, col_2):
    return df.assign(result=df[col_1] * df[col_2])

ddf_out = ddf.map_partitions(test_f, 'col_1', 'col_2')

# Here is a good place to do something with the BIG ddf_out dataframe before calling .compute()
result = ddf_out.compute(get=get)  # Will load the whole dataframe into memory
2) Use a lambda (with the column names hardcoded in the function):
ddf_out = ddf.map_partitions(lambda df: df.assign(result=df.col_1 * df.col_2))

# Here is a good place to do something with the BIG ddf_out dataframe before calling .compute()
result = ddf_out.compute(get=get)  # Will load the whole dataframe into memory
Update:
To apply a function on a row-by-row basis, here is a quote from the post you linked:
map / apply
You can map a function row-wise across a series with map
df.mycolumn.map(func)
You can map a function row-wise across a dataframe with apply
df.apply(func, axis=1)
I.e. for the example function in your question, it might look like this:
def test_f(dds, col_1, col_2):
    return dds[col_1] * dds[col_2]
Since you will be applying it on a row-by-row basis the function's first argument will be a series (i.e. each row of a dataframe is a series).
To apply this function then you might call it like this:
dds_out = ddf.apply(
    test_f,
    args=('col_1', 'col_2'),
    axis=1,
    meta=('result', int)
).compute(get=get)
This will return a series named 'result'.
I guess you could also call .apply on each partition with a function, but it does not look to be any more efficient than calling .apply on the dataframe directly. But maybe your tests will prove otherwise.
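For illustration only (a sketch of that idea, not from the original answer; apply_in_partition is a hypothetical helper name), calling .apply inside each partition would look roughly like this:
def apply_in_partition(pdf, col_1, col_2):
    # ordinary pandas row-wise apply, run once per pandas partition
    return pdf.apply(lambda row: row[col_1] * row[col_2], axis=1).rename('result')

dds_out = ddf.map_partitions(apply_in_partition, 'col_1', 'col_2',
                             meta=('result', 'int64'))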
Your test_f takes two arguments: col_1 and col_2. You pass a single argument, ddf.
Try something like
In [5]: dd.map_partitions(test_f, ddf['col_1'], ddf['col_2'])
Out[5]:
Dask Series Structure:
npartitions=8
0        int64
1250       ...
          ...
8750       ...
9999       ...
dtype: int64
Dask Name: test_f, 32 tasks
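One caveat worth adding (my note, not from either answer): in more recent dask releases the get= keyword to .compute() was removed in favour of scheduler=, so under that assumption the calls above would end in something like:
# rough equivalent on newer dask versions (assuming the get= keyword is gone)
result = ddf_out.compute(scheduler='processes')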
