Pivot: ValueError: Index contains duplicate entries, cannot reshape [duplicate] - python

This question already has answers here:
Pandas pivot table ValueError: Index contains duplicate entries, cannot reshape
(2 answers)
Closed 4 years ago.
I want to plot a heatmap between hashtags and username from the given final table after cleaning and pre-processing.
Getting the following error.
I have pasted the full error which I'm getting I searched on similar StackOverflow errors but was unable to get the correct result.
final_sns = final.pivot("hashtags", "username")
ax = sns.heatmap(final_sns)
ValueError Traceback (most recent call last)
<ipython-input-51-277e0506604d> in <module>()
----> 1 final_sns = final.pivot("hashtags", "username")
2 ax = sns.heatmap(final_sns)
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
5192 """
5193 from pandas.core.reshape.reshape import pivot
-> 5194 return pivot(self, index=index, columns=columns, values=values)
5195
5196 _shared_docs['pivot_table'] = """
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in pivot(self, index, columns, values)
413 indexed = self._constructor_sliced(self[values].values,
414 index=index)
--> 415 return indexed.unstack(columns)
416
417
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\frame.py in unstack(self, level, fill_value)
5532 """
5533 from pandas.core.reshape.reshape import unstack
-> 5534 return unstack(self, level, fill_value)
5535
5536 _shared_docs['melt'] = ("""
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
493 if isinstance(obj, DataFrame):
494 if isinstance(obj.index, MultiIndex):
--> 495 return _unstack_frame(obj, level, fill_value=fill_value)
496 else:
497 return obj.T.stack(dropna=False)
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in _unstack_frame(obj, level, fill_value)
507 unstacker = partial(_Unstacker, index=obj.index,
508 level=level, fill_value=fill_value)
--> 509 blocks = obj._data.unstack(unstacker)
510 return obj._constructor(blocks)
511 else:
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\internals.py in unstack(self, unstacker_func)
4608 unstacked : BlockManager
4609 """
-> 4610 dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
4611 new_columns = dummy.get_new_columns()
4612 new_index = dummy.get_new_index()
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in __init__(self, values, index, level, value_columns, fill_value, constructor)
135
136 self._make_sorted_values_labels()
--> 137 self._make_selectors()
138
139 def _make_sorted_values_labels(self):
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in _make_selectors(self)
173
174 if mask.sum() < len(self.index):
--> 175 raise ValueError('Index contains duplicate entries, '
176 'cannot reshape')
177
ValueError: Index contains duplicate entries, cannot reshape
What is that I am missing?

Seems like you have duplicated rows in your DataFrame, so your pivot doesn't know which row to take while pivoting.
Try the DataFrame.duplicated() method to check for them.

Related

getting the following error = ValueError: All arrays must be of the same length

df = pd.DataFrame({'user_message':messages, 'message_date':dates })
# convert message_date type
df['message_date'] = pd.to_datetime(df['message_date'], format='%d/%m/%Y, %H:%M - ')
df.rename(columns={'message_date': 'date'}, inplace=True)
df.head()
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_2028\475687816.py in <module>
----> 1 df = pd.DataFrame({'user_message':messages, 'message_date':dates })
2
3 # convert message_date type
4
5 df['message_date'] = pd.to_datetime(df['message_date'], format='%d/%m/%Y, %H:%M - ')
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
634 elif isinstance(data, dict):
635 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 636 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
637 elif isinstance(data, ma.MaskedArray):
638 import numpy.ma.mrecords as mrecords
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in dict_to_mgr(data, index, columns, dtype, typ, copy)
500 # TODO: can we get rid of the dt64tz special case above?
501
--> 502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
503
504
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, columns, index, dtype, verify_integrity, typ, consolidate)
118 # figure out the index, if necessary
119 if index is None:
--> 120 index = _extract_index(arrays)
121 else:
122 index = ensure_index(index)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in _extract_index(data)
672 lengths = list(set(raw_lengths))
673 if len(lengths) > 1:
--> 674 raise ValueError("All arrays must be of the same length")
675
676 if have_dicts:
ValueError: All arrays must be of the same length
Trying to create a pandas dataframe where 2 columns will be created by the names user_message and message_date.
Also in the next line i've tried to convert message_date(column) into date time in the defined format.
It appears that your lists messages and dates in this line df = pd.DataFrame({'user_message':messages, 'message_date':dates }) are not the same length, you may want to verify that they are the same length.

DataError: No numeric types to aggregate pandas pivot

I have a pandas dataframe like this:
User-Id Training-Id TrainingTaken
0 4327024 25 10
1 6662572 3 10
2 3757520 26 10
and I need to convert it to a Matrix like they do here:
https://github.com/tr1ten/Anime-Recommender-System/blob/main/HybridRecommenderSystem.ipynb
Cell 13.
So I did the following:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling
from scipy.sparse import csr_matrix
from lightfm.evaluation import auc_score
from lightfm.data import Dataset
user_training_interaction = pd.pivot_table(trainingtaken, index='User-Id', columns='Training-Id', values='TrainingTaken')
user_training_interaction.fillna(0,inplace=True)
user_training_csr = csr_matrix(user_training_interaction.values)
But I get this error:
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-96-5a2c7ba28976> in <module>
10 from lightfm.data import Dataset
11
---> 12 user_training_interaction = pd.pivot_table(trainingtaken, index='User-Id', columns='Training-Id', values='TrainingTaken')
13 user_training_interaction.fillna(0,inplace=True)
14 user_training_csr = csr_matrix(user_training_interaction.values)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
110
111 grouped = data.groupby(keys, observed=observed)
--> 112 agged = grouped.agg(aggfunc)
113 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
114 agged = agged.dropna(how="all")
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
949 func = maybe_mangle_lambdas(func)
950
--> 951 result, how = self._aggregate(func, *args, **kwargs)
952 if how is None:
953 return result
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
305
306 if isinstance(arg, str):
--> 307 return self._try_aggregate_string_function(arg, *args, **kwargs), None
308
309 if isinstance(arg, dict):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/base.py in _try_aggregate_string_function(self, arg, *args, **kwargs)
261 if f is not None:
262 if callable(f):
--> 263 return f(*args, **kwargs)
264
265 # people may try to aggregate on a non-callable attribute
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in mean(self, numeric_only)
1396 "mean",
1397 alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only),
-> 1398 numeric_only=numeric_only,
1399 )
1400
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
1020 ) -> DataFrame:
1021 agg_blocks, agg_items = self._cython_agg_blocks(
-> 1022 how, alt=alt, numeric_only=numeric_only, min_count=min_count
1023 )
1024 return self._wrap_agged_blocks(agg_blocks, items=agg_items)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
1128
1129 if not (agg_blocks or split_frames):
-> 1130 raise DataError("No numeric types to aggregate")
1131
1132 if split_items:
DataError: No numeric types to aggregate
What am I missing?
The Pandas Documentation states:
While pivot() provides general purpose pivoting with various data
types (strings, numerics, etc.), pandas also provides pivot_table()
for pivoting with aggregation of numeric data
Make sure the column is numeric. Without seeing how you create trainingtaken I can't provide more specific guidance. However the following may help:
Make sure you handle "empty" values in that column. The Pandas guide is a very good place to start. Pandas points out that "a column of integers with even one missing values is cast to floating-point dtype".
If working with a dataframe, the column can be cast to a specific type via your_df.your_col.astype(int) or for your example, pd.trainingtaken.astype(int)

join two tuples with different index and different shape

I have two tuples which i like to have in one but i always get a lot of NaN's when I try to concat, merge or join them.
my tuple row looks like this :
and my tuple row_ges looks like this (shape: 5 rows, 6 columns):
I would like to have this in one dataframe like this shape (5 rows,301 columns):
i tried
result=row_ges+row
result_1=row_ges.append(row,ignore_index=True)
result_2 = pd.concat([row_ges,row], axis=1, ignore_index=True)
but i always get a shape like (10rows, 301 columns) and for result_2 i get an error code i don't understand:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-209-cdc656a8e828> in <module>()
4 result=row_ges+row
5 result_1=row_ges.append(row,ignore_index=True)
----> 6 result_2 = pd.concat([row_ges,row], axis=1, ignore_index=True)
7
8 #gesamt = pd.DataFrame()
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, sort, copy)
224 verify_integrity=verify_integrity,
225 copy=copy, sort=sort)
--> 226 return op.get_result()
227
228
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/concat.py in get_result(self)
421 new_data = concatenate_block_managers(
422 mgrs_indexers, self.new_axes, concat_axis=self.axis,
--> 423 copy=self.copy)
424 if not self.copy:
425 new_data._consolidate_inplace()
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy)
5423 blocks.append(b)
5424
-> 5425 return BlockManager(blocks, axes)
5426
5427
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check)
3280
3281 if do_integrity_check:
-> 3282 self._verify_integrity()
3283
3284 self._consolidate_check()
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in _verify_integrity(self)
3491 for block in self.blocks:
3492 if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
-> 3493 construction_error(tot_items, block.shape[1:], self.axes)
3494 if len(self.items) != tot_items:
3495 raise AssertionError('Number of manager items must equal union of '
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
4841 raise ValueError("Empty data passed with indices specified.")
4842 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 4843 passed, implied))
4844
4845
ValueError: Shape of passed values is (301, 29), indices imply (301, 9)
row.reset_index(drop=True, inplace=True)
row_ges.reset_index(drop=True, inplace=True)
result = pd.concat([row_ges,row], axis=1)
The problem was the different index. With this code it is working.

list comprehension does not work with sparse dataframe + never ending groupby and apply computation

I have been stuck for few days on a perhaps easy problem with python (I am a new user). I will report here a simplified version of the issue, considering a very small dataframe (df). In the simplified world the codes work, while with the big df normal operations, like slicing df by column, do not work anymore.
1) Consider a (5x2) df:
df = pd.DataFrame({'a': [1432, 1432, 1433, 1432, 1434],
'b': ['ab152', 'ab153', 'ab154', np.nan, 'ab156']})
df2 = pd.get_dummies(df.b, sparse=True)
type(df2)
[out] pandas.sparse.frame.SparseDataFrame
df2['a'] = df.a
df2 = df2.groupby('a').apply(max)[df2.columns[:-1]].to_sparse()
all works fine here. In plain text, I'd like to create a dummy matrix according to a specific column and then remove duplicates in the index by using, in this case, a max function (other functions could be used according to the purpose). 'Sparse' is necessary for memory efficiency reasons (the number of zeros is relatively very high).
Moreover, if I want to extract column 'b', I just write
df['b']
and it works.
2) In my more complex case I have roughly 5 million rows and 3 thousands cols. I apply the same set of codes.
dummy_matrix = pd.get_dummies(big_df.b, sparse=True)
type(dummy_matrix)
[out] pandas.sparse.frame.SparseDataFrame
dummy_matrix['a'] = big_df.a
dummy_matrix = dummy_matrix.groupby('a').apply(max)[dummy_matrix.columns[:-1]].to_sparse()
But the last line of code never ends and does not provide any error message.
Moreover, If I want to select column 'b' in this case I get an error as in the following:
In [81]: dummy_matrix['b']
Out[81]: ---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.pyc in __call__(self, obj)
688 type_pprinters=self.type_printers,
689 deferred_pprinters=self.deferred_printers)
--> 690 printer.pretty(obj)
691 printer.flush()
692 return stream.getvalue()
/usr/local/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in pretty(self, obj)
407 if callable(meth):
408 return meth(obj, self, cycle)
--> 409 return _default_pprint(obj, self, cycle)
410 finally:
411 self.end_group()
/usr/local/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
527 if _safe_getattr(klass, '__repr__', None) not in _baseclass_reprs:
528 # A user-provided repr. Find newlines and replace them with
p.break_()
--> 529 _repr_pprint(obj, p, cycle)
530 return
531 p.begin_group(1, '<')
/usr/local/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in _repr_pprint(obj, p, cycle)
709 """A pprint that just redirects to the normal repr function."""
710 # Find newlines and replace them with p.break_()
--> 711 output = repr(obj)
712 for idx,output_line in enumerate(output.splitlines()):
713 if idx:
/usr/local/lib/python2.7/dist-packages/pandas/core/base.pyc in __repr__(self)
62 Yields Bytestring in Py2, Unicode String in py3.
63 """
---> 64 return str(self)
65
66
/usr/local/lib/python2.7/dist-packages/pandas/core/base.pyc in __str__(self)
42 if compat.PY3:
43 return self.__unicode__()
---> 44 return self.__bytes__()
45
46 def __bytes__(self):
/usr/local/lib/python2.7/dist-packages/pandas/core/base.pyc in __bytes__(self)
54
55 encoding = get_option("display.encoding")
---> 56 return self.__unicode__().encode(encoding, 'replace')
57
58 def __repr__(self):
/usr/local/lib/python2.7/dist-packages/pandas/sparse/series.pyc in __unicode__(self)
290 def __unicode__(self):
291 # currently, unicode is same as repr...fixes infinite loop
--> 292 series_rep = Series.__unicode__(self)
293 rep = '%s\n%s' % (series_rep, repr(self.sp_index))
294 return rep
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __unicode__(self)
897
898 self.to_string(buf=buf, name=self.name, dtype=self.dtype,
--> 899 max_rows=max_rows)
900 result = buf.getvalue()
901
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in to_string(self, buf, na_rep, float_format, header, length, dtype, name, max_rows)
962 the_repr = self._get_repr(float_format=float_format,
na_rep=na_rep,
963 header=header, length=length, dtype=dtype,
--> 964 name=name, max_rows=max_rows)
965
966 # catch contract violations
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in _get_repr(self, name, header, length, dtype, na_rep, float_format, max_rows)
991 na_rep=na_rep,
992 float_format=float_format,
--> 993 max_rows=max_rows)
994 result = formatter.to_string()
995
/usr/local/lib/python2.7/dist-packages/pandas/core/format.pyc in __init__(self, series, buf, length, header, na_rep, name, float_format, dtype, max_rows)
146 self.dtype = dtype
147
--> 148 self._chk_truncate()
149
150 def _chk_truncate(self):
/usr/local/lib/python2.7/dist-packages/pandas/core/format.pyc in _chk_truncate(self)
159 else:
160 row_num = max_rows // 2
--> 161 series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
162 self.tr_row_num = row_num
163 self.tr_series = series
/usr/local/lib/python2.7/dist-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
/usr/local/lib/python2.7/dist-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index,
verify_integrity, copy)
803 for obj in objs:
804 if not isinstance(obj, NDFrame):
--> 805 raise TypeError("cannot concatenate a non-NDFrame
object")
806
807 # consolidate
TypeError: cannot concatenate a non-NDFrame object
What is the difference between the simpler and the more complex case? Why in one case the code works, while in the other it does not? Could it be related to dtypes? I checked in both cases and dtypes are the same for each col so I don't think the issue resides there. Moreover, do you think the two issues, i.e. the list comprehension problem and the never-ending computation, are related? I hope yes -> 1 solution for two problems.
Your help would be very appreciated and I am willing to give more details if necessary. Many thanks.

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

Using Python, I am struggling to merge 208 CSV files into one dataframe. (My files names are Customer_1.csv, Customer_2.csv,,, and Customer_208.csv)
Following are my codes,
%matplotlib inline
import pandas as pd
df_merged = pd.concat([pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i), names = ['Time', 'Energy_{0}'.format(i)], parse_dates=['Time'], index_col=['Time'], skiprows=1) for i in range(1, 209)], axis=1)
I got an error saying,
InvalidIndexError Traceback (most recent call last)
<ipython-input-4-a4d19b3c2a3e> in <module>()
----> 1 df_merged = pd.concat([pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i), names = ['Time', 'Energy_{0}'.format(i)], parse_dates=['Time'], index_col=['Time'], skiprows=1) for i in range(1, 209)], axis=1)
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
884 self.copy = copy
885
--> 886 self.new_axes = self._get_new_axes()
887
888 def get_result(self):
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_new_axes(self)
944 if i == self.axis:
945 continue
--> 946 new_axes[i] = self._get_comb_axis(i)
947 else:
948 if len(self.join_axes) != ndim - 1:
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_comb_axis(self, i)
970 raise TypeError("Cannot concatenate list of %s" % types)
971
--> 972 return _get_combined_index(all_indexes, intersect=self.intersect)
973
974 def _get_concat_axis(self):
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in _get_combined_index(indexes, intersect)
5730 index = index.intersection(other)
5731 return index
-> 5732 union = _union_indexes(indexes)
5733 return _ensure_index(union)
5734
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in _union_indexes(indexes)
5759
5760 if hasattr(result, 'union_many'):
-> 5761 return result.union_many(indexes[1:])
5762 else:
5763 for other in indexes[1:]:
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tseries/index.pyc in union_many(self, others)
847 else:
848 tz = this.tz
--> 849 this = Index.union(this, other)
850 if isinstance(this, DatetimeIndex):
851 this.tz = tz
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in union(self, other)
1400 result.extend([x for x in other.values if x not in value_set])
1401 else:
-> 1402 indexer = self.get_indexer(other)
1403 indexer, = (indexer == -1).nonzero()
1404
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in get_indexer(self, target, method, limit)
1685
1686 if not self.is_unique:
-> 1687 raise InvalidIndexError('Reindexing only valid with uniquely'
1688 ' valued Index objects')
1689
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
Do you have any idea to solve this problem???..
Your code works on a small sample of five files that I used for testing (each file containing two columns and three row). ONLY FOR DEBUGGING, try to write this in a for loop. First, before the loop, read all of the files into the list. Then loop again and append each one using a try/except block to catch the errors. Finally, print the problem files and investigate.
# First, read all the files into a list.
files_in = [pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i),
names = ['Time', 'Energy_{0}'.format(i)],
parse_dates=['Time'],
index_col=['Time'],
skiprows=1)
for i in range(1, 209)]
df = pd.DataFrame()
errors = []
# Try to append each file to the dataframe.
for i in range(1, 209):
try:
df = pd.concat([df, files_in[i - 1]], axis=1)
except:
errors.append(i)
# Print files containing errors.
for error in errors:
    print(files_in[error - 1])  # errors holds 1-based file numbers; files_in is 0-based

Categories