I am doing an online tutorial in a Jupyter notebook with Python and Pandas, and when I run the following code, I run into this error.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# reading the csv file
titanic = pd.read_csv("titanic.csv")
titanic_class = titanic.groupby("Pclass")
titanic_class.get_group(1)
titanic_class.max()
AssertionError Traceback (most recent call last)
<ipython-input-26-4d1be28a55cb> in <module>
1 #max ticket fare paid
----> 2 titanic_class.max()
~\anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in f(self, **kwargs)
1369 # try a cython aggregation if we can
1370 try:
-> 1371 return self._cython_agg_general(alias, alt=npfunc, **kwargs)
1372 except DataError:
1373 pass
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
992 ) -> DataFrame:
993 agg_blocks, agg_items = self._cython_agg_blocks(
--> 994 how, alt=alt, numeric_only=numeric_only, min_count=min_count
995 )
996 return self._wrap_agged_blocks(agg_blocks, items=agg_items)
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
1098 # Clean up the mess left over from split blocks.
1099 for locs, result in zip(split_items, split_frames):
-> 1100 assert len(locs) == result.shape[1]
1101 for i, loc in enumerate(locs):
1102 new_items.append(np.array([loc], dtype=locs.dtype))
AssertionError:
Can someone tell me what's wrong? Both titanic_class.sum() and titanic_class.mean() work without any error.
The last column of the CSV file has letters. Once I removed them, the max function worked.
This happens when any column has empty (NaN) values. Try removing those columns before using max.
Related
I have a pandas dataframe like this:
User-Id Training-Id TrainingTaken
0 4327024 25 10
1 6662572 3 10
2 3757520 26 10
and I need to convert it to a Matrix like they do here:
https://github.com/tr1ten/Anime-Recommender-System/blob/main/HybridRecommenderSystem.ipynb
Cell 13.
So I did the following:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling
from scipy.sparse import csr_matrix
from lightfm.evaluation import auc_score
from lightfm.data import Dataset
user_training_interaction = pd.pivot_table(trainingtaken, index='User-Id', columns='Training-Id', values='TrainingTaken')
user_training_interaction.fillna(0,inplace=True)
user_training_csr = csr_matrix(user_training_interaction.values)
But I get this error:
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-96-5a2c7ba28976> in <module>
10 from lightfm.data import Dataset
11
---> 12 user_training_interaction = pd.pivot_table(trainingtaken, index='User-Id', columns='Training-Id', values='TrainingTaken')
13 user_training_interaction.fillna(0,inplace=True)
14 user_training_csr = csr_matrix(user_training_interaction.values)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
110
111 grouped = data.groupby(keys, observed=observed)
--> 112 agged = grouped.agg(aggfunc)
113 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
114 agged = agged.dropna(how="all")
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
949 func = maybe_mangle_lambdas(func)
950
--> 951 result, how = self._aggregate(func, *args, **kwargs)
952 if how is None:
953 return result
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
305
306 if isinstance(arg, str):
--> 307 return self._try_aggregate_string_function(arg, *args, **kwargs), None
308
309 if isinstance(arg, dict):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/base.py in _try_aggregate_string_function(self, arg, *args, **kwargs)
261 if f is not None:
262 if callable(f):
--> 263 return f(*args, **kwargs)
264
265 # people may try to aggregate on a non-callable attribute
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in mean(self, numeric_only)
1396 "mean",
1397 alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only),
-> 1398 numeric_only=numeric_only,
1399 )
1400
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
1020 ) -> DataFrame:
1021 agg_blocks, agg_items = self._cython_agg_blocks(
-> 1022 how, alt=alt, numeric_only=numeric_only, min_count=min_count
1023 )
1024 return self._wrap_agged_blocks(agg_blocks, items=agg_items)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
1128
1129 if not (agg_blocks or split_frames):
-> 1130 raise DataError("No numeric types to aggregate")
1131
1132 if split_items:
DataError: No numeric types to aggregate
What am I missing?
The Pandas Documentation states:
While pivot() provides general purpose pivoting with various data
types (strings, numerics, etc.), pandas also provides pivot_table()
for pivoting with aggregation of numeric data
Make sure the column is numeric. Without seeing how you create trainingtaken I can't provide more specific guidance. However the following may help:
Make sure you handle "empty" values in that column. The Pandas guide is a very good place to start. Pandas points out that "a column of integers with even one missing values is cast to floating-point dtype".
If working with a dataframe, the column can be cast to a specific type via your_df.your_col.astype(int) or, for your example, trainingtaken.TrainingTaken.astype(int)
I am going to group males and females by gender to visualize what the Churn Rate (1 - Retention Rate) looks like for each value.
My output
df_data.groupby(by='gender')['Churn'].mean()
error
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-46-75992efc6958> in <module>()
----> 1 df_data.groupby(by='gender')['Churn'].mean()
1 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/groupby/groupby.py in mean(self, numeric_only)
1396 "mean",
1397 alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only),
-> 1398 numeric_only=numeric_only,
1399 )
1400
/usr/local/lib/python3.7/dist-packages/pandas/core/groupby/groupby.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
1051
1052 if len(output) == 0:
-> 1053 raise DataError("No numeric types to aggregate")
1054
1055 return self._wrap_aggregated_output(output, index=self.grouper.result_index)
DataError: No numeric types to aggregate
All your columns, even those that look like numbers, are strings. You must convert "numeric" columns into numeric columns with .astype(int) before applying .mean(). For example:
df.tenure = df.tenure.astype(int)
For your precise answer
df_data.Churn = df_data.Churn.astype(int)
I am loading multiple parquet files containing timeseries data together. But the loaded dask dataframe has unknown partitions because of which I can't apply various time series operations on it.
df = dd.read_parquet('/path/to/*.parquet', index='Timestamps')
For instance, df_resampled = df.resample('1T').mean().compute() gives following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-8e6f7f4340fd> in <module>
1 df = dd.read_parquet('/path/to/*.parquet', index='Timestamps')
----> 2 df_resampled = df.resample('1T').mean().compute()
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/core.py in resample(self, rule, closed, label)
2627 from .tseries.resample import Resampler
2628
-> 2629 return Resampler(self, rule, closed=closed, label=label)
2630
2631 #derived_from(pd.DataFrame)
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/tseries/resample.py in __init__(self, obj, rule, **kwargs)
118 "for more information."
119 )
--> 120 raise ValueError(msg)
121 self.obj = obj
122 self._rule = pd.tseries.frequencies.to_offset(rule)
ValueError: Can only resample dataframes with known divisions
See https://docs.dask.org/en/latest/dataframe-design.html#partitions
for more information.
I went to the link: https://docs.dask.org/en/latest/dataframe-design.html#partitions and it says,
In these cases (when divisions are unknown), any operation that requires a cleanly partitioned DataFrame with known divisions will have to perform a sort. This can generally be achieved by calling df.set_index(...).
I then tried following, but no success.
df = dd.read_parquet('/path/to/*.parquet')
df = df.set_index('Timestamps')
This step throws the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-468e9af0c4d6> in <module>
1 df = dd.read_parquet(os.path.join(OUTPUT_DATA_DIR, '20*.gzip'))
----> 2 df.set_index('Timestamps')
3 # df_resampled = df.resample('1T').mean().compute()
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/core.py in set_index(***failed resolving arguments***)
3915 npartitions=npartitions,
3916 divisions=divisions,
-> 3917 **kwargs,
3918 )
3919
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/shuffle.py in set_index(df, index, npartitions, shuffle, compute, drop, upsample, divisions, partition_size, **kwargs)
483 if divisions is None:
484 sizes = df.map_partitions(sizeof) if repartition else []
--> 485 divisions = index2._repartition_quantiles(npartitions, upsample=upsample)
486 mins = index2.map_partitions(M.min)
487 maxes = index2.map_partitions(M.max)
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/core.py in __getattr__(self, key)
3755 return self[key]
3756 else:
-> 3757 raise AttributeError("'DataFrame' object has no attribute %r" % key)
3758
3759 def __dir__(self):
AttributeError: 'DataFrame' object has no attribute '_repartition_quantiles'
Can anybody suggest what is the right way to load multiple timeseries files as a dask dataframe on which timeseries operations of pandas can be applied?
I would like to iterate through groups in a dataframe. This is possible in pandas, but when I port this to koalas, I get an error.
import databricks.koalas as ks
import pandas as pd
pdf = pd.DataFrame({'x':range(3), 'y':['a','b','b'], 'z':['a','b','b']})
# Create a Koalas DataFrame from pandas DataFrame
df = ks.from_pandas(pdf)
for a in df.groupby('x'):
print(a)
Here is the error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-35-d4164d1f71e0> in <module>
----> 1 for a in df.groupby('x'):
2 print(a)
/opt/conda/lib/python3.7/site-packages/databricks/koalas/groupby.py in __getitem__(self, item)
2630 if self._as_index and is_name_like_value(item):
2631 return SeriesGroupBy(
-> 2632 self._kdf._kser_for(item if is_name_like_tuple(item) else (item,)),
2633 self._groupkeys,
2634 dropna=self._dropna,
/opt/conda/lib/python3.7/site-packages/databricks/koalas/frame.py in _kser_for(self, label)
721 Name: id, dtype: int64
722 """
--> 723 return self._ksers[label]
724
725 def _apply_series_op(self, op, should_resolve: bool = False):
KeyError: (0,)
Is this kind of group iteration possible in koalas? The koalas documentation kind of implies it is possible - https://koalas.readthedocs.io/en/latest/reference/groupby.html
Groupby iteration is not yet implemented:
https://github.com/databricks/koalas/issues/2014
I have a parquet file called data.parquet. I'm using the library dask from Python. When I run the line
import dask.dataframe as dd
df = dd.read_parquet('data.parquet',engine='pyarrow')
I get the error
TypeError Traceback (most recent call last)
<ipython-input-22-807fa43763c1> in <module>
----> 1 df = dd.read_parquet('data.parquet',engine='pyarrow')
~/anaconda3/lib/python3.7/site-packages/dask/dataframe/io/parquet.py in read_parquet(path, columns, filters, categories, index, storage_options, engine, infer_divisions)
1395 categories=categories,
1396 index=index,
-> 1397 infer_divisions=infer_divisions,
1398 )
1399
~/anaconda3/lib/python3.7/site-packages/dask/dataframe/io/parquet.py in _read_pyarrow(fs, fs_token, paths, columns, filters, categories, index, infer_divisions)
858 _open = lambda fn: pq.ParquetFile(fs.open(fn, mode="rb"))
859 for piece in dataset.pieces:
--> 860 pf = piece.get_metadata(_open)
861 # non_empty_pieces.append(piece)
862 if pf.num_row_groups > 0:
TypeError: get_metadata() takes 1 positional argument but 2 were given
I just don't understand why this happens, since this is how it is implemented here.
Any help will be appreciated!
I faced the same problem. I resolved it by upgrading dask to version 2.30.0.