Using Python, I am struggling to merge 208 CSV files into one DataFrame. (My file names are Customer_1.csv, Customer_2.csv, ..., and Customer_208.csv.)
Here is my code:
%matplotlib inline
import pandas as pd
# Read Customer_1.csv ... Customer_208.csv and join them side by side (axis=1).
# Each file is loaded with its first line skipped (skiprows=1), its two columns
# labelled 'Time' and 'Energy_<i>', and the parsed 'Time' column as the index.
df_merged = pd.concat([pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i), names = ['Time', 'Energy_{0}'.format(i)], parse_dates=['Time'], index_col=['Time'], skiprows=1) for i in range(1, 209)], axis=1)
I got an error saying,
InvalidIndexError Traceback (most recent call last)
<ipython-input-4-a4d19b3c2a3e> in <module>()
----> 1 df_merged = pd.concat([pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i), names = ['Time', 'Energy_{0}'.format(i)], parse_dates=['Time'], index_col=['Time'], skiprows=1) for i in range(1, 209)], axis=1)
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
884 self.copy = copy
885
--> 886 self.new_axes = self._get_new_axes()
887
888 def get_result(self):
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_new_axes(self)
944 if i == self.axis:
945 continue
--> 946 new_axes[i] = self._get_comb_axis(i)
947 else:
948 if len(self.join_axes) != ndim - 1:
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_comb_axis(self, i)
970 raise TypeError("Cannot concatenate list of %s" % types)
971
--> 972 return _get_combined_index(all_indexes, intersect=self.intersect)
973
974 def _get_concat_axis(self):
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in _get_combined_index(indexes, intersect)
5730 index = index.intersection(other)
5731 return index
-> 5732 union = _union_indexes(indexes)
5733 return _ensure_index(union)
5734
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in _union_indexes(indexes)
5759
5760 if hasattr(result, 'union_many'):
-> 5761 return result.union_many(indexes[1:])
5762 else:
5763 for other in indexes[1:]:
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tseries/index.pyc in union_many(self, others)
847 else:
848 tz = this.tz
--> 849 this = Index.union(this, other)
850 if isinstance(this, DatetimeIndex):
851 this.tz = tz
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in union(self, other)
1400 result.extend([x for x in other.values if x not in value_set])
1401 else:
-> 1402 indexer = self.get_indexer(other)
1403 indexer, = (indexer == -1).nonzero()
1404
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in get_indexer(self, target, method, limit)
1685
1686 if not self.is_unique:
-> 1687 raise InvalidIndexError('Reindexing only valid with uniquely'
1688 ' valued Index objects')
1689
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
Do you have any idea how to solve this problem?
Your code works on a small sample of five files that I used for testing (each file containing two columns and three rows), so the problem is most likely in the data of one or more files: the error message says an index is not uniquely valued. ONLY FOR DEBUGGING, try writing this as a for loop. First, before the loop, read all of the files into a list. Then loop over that list and append each frame inside a try/except block to catch the errors. Finally, print the problem files and investigate.
# Debugging aid: find which Customer_<i>.csv files cannot be concatenated.
# First, read all the files into a list.
files_in = [pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i),
                        names=['Time', 'Energy_{0}'.format(i)],
                        parse_dates=['Time'],
                        index_col=['Time'],
                        skiprows=1)
            for i in range(1, 209)]
df = pd.DataFrame()
errors = []
# Try to append each file to the dataframe.
# File numbers are 1-based while list positions are 0-based, so enumerate
# from 1 to keep the file number next to its frame.
for i, frame in enumerate(files_in, start=1):
    try:
        df = pd.concat([df, frame], axis=1)
    except Exception:
        # Record the failing file number and keep going; a bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit.
        errors.append(i)
# Print files containing errors.
for error in errors:
    # `error` is a 1-based file number; convert it back to a list position.
    print(files_in[error - 1])
Related
# Build a two-column frame from the `messages` and `dates` sequences.
# NOTE(review): the constructor raises "All arrays must be of the same length"
# when the two sequences differ in length.
df = pd.DataFrame({'user_message':messages, 'message_date':dates })
# convert message_date type
df['message_date'] = pd.to_datetime(df['message_date'], format='%d/%m/%Y, %H:%M - ')
# Rename the converted column to 'date' in place, then preview the frame.
df.rename(columns={'message_date': 'date'}, inplace=True)
df.head()
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_2028\475687816.py in <module>
----> 1 df = pd.DataFrame({'user_message':messages, 'message_date':dates })
2
3 # convert message_date type
4
5 df['message_date'] = pd.to_datetime(df['message_date'], format='%d/%m/%Y, %H:%M - ')
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
634 elif isinstance(data, dict):
635 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 636 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
637 elif isinstance(data, ma.MaskedArray):
638 import numpy.ma.mrecords as mrecords
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in dict_to_mgr(data, index, columns, dtype, typ, copy)
500 # TODO: can we get rid of the dt64tz special case above?
501
--> 502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
503
504
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, columns, index, dtype, verify_integrity, typ, consolidate)
118 # figure out the index, if necessary
119 if index is None:
--> 120 index = _extract_index(arrays)
121 else:
122 index = ensure_index(index)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in _extract_index(data)
672 lengths = list(set(raw_lengths))
673 if len(lengths) > 1:
--> 674 raise ValueError("All arrays must be of the same length")
675
676 if have_dicts:
ValueError: All arrays must be of the same length
I am trying to create a pandas DataFrame with two columns named user_message and message_date.
In the next line, I also try to convert the message_date column to datetime using the given format.
It appears that the lists messages and dates used in the line df = pd.DataFrame({'user_message':messages, 'message_date':dates }) are not the same length. Compare len(messages) and len(dates) to verify this.
# Wrap a 9x4 array of standard-normal samples in a DataFrame.
a = np.random.standard_normal((9,4))
dg = pd.DataFrame(a)
# NOTE(review): the labels are wrapped in an extra list here; presumably this
# produces multi-level columns rather than four plain labels -- confirm.
dg.columns = [["No1", "No2", "No3", "No4"]]
dg["No1"]
Hello all. I have been using JupyterLab, opened through Anaconda Navigator, and I wrote the code above. The first three lines look normal; however, the fourth line gives the error below. If I change the fourth line to dg[["No1"]], then it "works". However, in that case type(dg[["No1"]]) is actually a DataFrame, not a Series.
I am a bit of a newbie, and I have scratched my head for almost two hours and still don't understand what's wrong. Can somebody help? Thanks!
TypeError Traceback (most recent call last)
<ipython-input-393-b26f43cf53bf> in <module>
----> 1 dg["No1"]
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2774 if self.columns.nlevels > 1:
2775 return self._getitem_multilevel(key)
-> 2776 return self._get_item_cache(key)
2777
2778 # Do we have a slicer (on rows)?
~\anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
3584 res = cache.get(item)
3585 if res is None:
-> 3586 values = self._data.get(item)
3587 res = self._box_item_values(item, values)
3588 cache[item] = res
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in get(self, item)
966 raise ValueError("cannot label index with a null key")
967
--> 968 return self.iget(loc)
969 else:
970
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in iget(self, i)
983 Otherwise return as a ndarray
984 """
--> 985 block = self.blocks[self._blknos[i]]
986 values = block.iget(self._blklocs[i])
987
TypeError: only integer scalar arrays can be converted to a scalar index
You can just do this, unless you want a MultiIndex:
dg.columns = ["No1", "No2", "No3", "No4"]
I have two tuples which I would like to combine into one, but I always get a lot of NaNs when I try to concat, merge or join them.
my tuple row looks like this :
and my tuple row_ges looks like this (shape: 5 rows, 6 columns):
I would like to have this in one dataframe like this shape (5 rows,301 columns):
i tried
result=row_ges+row
result_1=row_ges.append(row,ignore_index=True)
result_2 = pd.concat([row_ges,row], axis=1, ignore_index=True)
but I always get a shape like (10 rows, 301 columns), and for result_2 I get an error I don't understand:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-209-cdc656a8e828> in <module>()
4 result=row_ges+row
5 result_1=row_ges.append(row,ignore_index=True)
----> 6 result_2 = pd.concat([row_ges,row], axis=1, ignore_index=True)
7
8 #gesamt = pd.DataFrame()
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, sort, copy)
224 verify_integrity=verify_integrity,
225 copy=copy, sort=sort)
--> 226 return op.get_result()
227
228
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/concat.py in get_result(self)
421 new_data = concatenate_block_managers(
422 mgrs_indexers, self.new_axes, concat_axis=self.axis,
--> 423 copy=self.copy)
424 if not self.copy:
425 new_data._consolidate_inplace()
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy)
5423 blocks.append(b)
5424
-> 5425 return BlockManager(blocks, axes)
5426
5427
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check)
3280
3281 if do_integrity_check:
-> 3282 self._verify_integrity()
3283
3284 self._consolidate_check()
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in _verify_integrity(self)
3491 for block in self.blocks:
3492 if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
-> 3493 construction_error(tot_items, block.shape[1:], self.axes)
3494 if len(self.items) != tot_items:
3495 raise AssertionError('Number of manager items must equal union of '
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
4841 raise ValueError("Empty data passed with indices specified.")
4842 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 4843 passed, implied))
4844
4845
ValueError: Shape of passed values is (301, 29), indices imply (301, 9)
# Replace the index of both frames in place, dropping the old index values,
# so the two frames no longer carry different indexes into the concat.
row.reset_index(drop=True, inplace=True)
row_ges.reset_index(drop=True, inplace=True)
# Join the two frames side by side (axis=1).
result = pd.concat([row_ges,row], axis=1)
The problem was that the two DataFrames had different indexes. With this code, which resets both indexes before concatenating, it works.
This question already has answers here:
Pandas pivot table ValueError: Index contains duplicate entries, cannot reshape
(2 answers)
Closed 4 years ago.
I want to plot a heatmap of hashtags against usernames from the final table obtained after cleaning and pre-processing.
I am getting the following error.
I have pasted the full error below. I searched for similar errors on StackOverflow but was unable to find a solution.
# Reshape `final` with "hashtags" as the index and "username" as the columns
# (the first two positional parameters of pivot), then draw it as a heatmap.
final_sns = final.pivot("hashtags", "username")
ax = sns.heatmap(final_sns)
ValueError Traceback (most recent call last)
<ipython-input-51-277e0506604d> in <module>()
----> 1 final_sns = final.pivot("hashtags", "username")
2 ax = sns.heatmap(final_sns)
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
5192 """
5193 from pandas.core.reshape.reshape import pivot
-> 5194 return pivot(self, index=index, columns=columns, values=values)
5195
5196 _shared_docs['pivot_table'] = """
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in pivot(self, index, columns, values)
413 indexed = self._constructor_sliced(self[values].values,
414 index=index)
--> 415 return indexed.unstack(columns)
416
417
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\frame.py in unstack(self, level, fill_value)
5532 """
5533 from pandas.core.reshape.reshape import unstack
-> 5534 return unstack(self, level, fill_value)
5535
5536 _shared_docs['melt'] = ("""
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
493 if isinstance(obj, DataFrame):
494 if isinstance(obj.index, MultiIndex):
--> 495 return _unstack_frame(obj, level, fill_value=fill_value)
496 else:
497 return obj.T.stack(dropna=False)
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in _unstack_frame(obj, level, fill_value)
507 unstacker = partial(_Unstacker, index=obj.index,
508 level=level, fill_value=fill_value)
--> 509 blocks = obj._data.unstack(unstacker)
510 return obj._constructor(blocks)
511 else:
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\internals.py in unstack(self, unstacker_func)
4608 unstacked : BlockManager
4609 """
-> 4610 dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
4611 new_columns = dummy.get_new_columns()
4612 new_index = dummy.get_new_index()
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in __init__(self, values, index, level, value_columns, fill_value, constructor)
135
136 self._make_sorted_values_labels()
--> 137 self._make_selectors()
138
139 def _make_sorted_values_labels(self):
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in _make_selectors(self)
173
174 if mask.sum() < len(self.index):
--> 175 raise ValueError('Index contains duplicate entries, '
176 'cannot reshape')
177
ValueError: Index contains duplicate entries, cannot reshape
What am I missing?
It seems that you have duplicated rows in your DataFrame, so pivot doesn't know which row to take while pivoting.
Try the duplicated method to find them.
I have been stuck for a few days on a perhaps easy problem with Python (I am a new user). I will report here a simplified version of the issue, using a very small DataFrame (df). In the simplified case the code works, while with the big DataFrame normal operations, like selecting a column, no longer work.
1) Consider a (5x2) df:
df = pd.DataFrame({'a': [1432, 1432, 1433, 1432, 1434],
'b': ['ab152', 'ab153', 'ab154', np.nan, 'ab156']})
df2 = pd.get_dummies(df.b, sparse=True)
type(df2)
[out] pandas.sparse.frame.SparseDataFrame
df2['a'] = df.a
df2 = df2.groupby('a').apply(max)[df2.columns[:-1]].to_sparse()
Everything works fine here. In plain text, I'd like to create a dummy matrix from a specific column and then remove duplicates in the index by using, in this case, a max function (other functions could be used depending on the purpose). 'Sparse' is necessary for memory efficiency (the proportion of zeros is very high).
Moreover, if I want to extract column 'b', I just write
df['b']
and it works.
2) In my more complex case I have roughly 5 million rows and 3 thousands cols. I apply the same set of codes.
dummy_matrix = pd.get_dummies(big_df.b, sparse=True)
type(dummy_matrix)
[out] pandas.sparse.frame.SparseDataFrame
dummy_matrix['a'] = big_df.a
dummy_matrix = dummy_matrix.groupby('a').apply(max)[dummy_matrix.columns[:-1]].to_sparse()
But the last line of code never finishes and does not produce any error message.
Moreover, if I want to select column 'b' in this case, I get the following error:
In [81]: dummy_matrix['b']
Out[81]: ---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.pyc in __call__(self, obj)
688 type_pprinters=self.type_printers,
689 deferred_pprinters=self.deferred_printers)
--> 690 printer.pretty(obj)
691 printer.flush()
692 return stream.getvalue()
/usr/local/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in pretty(self, obj)
407 if callable(meth):
408 return meth(obj, self, cycle)
--> 409 return _default_pprint(obj, self, cycle)
410 finally:
411 self.end_group()
/usr/local/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
527 if _safe_getattr(klass, '__repr__', None) not in _baseclass_reprs:
528 # A user-provided repr. Find newlines and replace them with
p.break_()
--> 529 _repr_pprint(obj, p, cycle)
530 return
531 p.begin_group(1, '<')
/usr/local/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in _repr_pprint(obj, p, cycle)
709 """A pprint that just redirects to the normal repr function."""
710 # Find newlines and replace them with p.break_()
--> 711 output = repr(obj)
712 for idx,output_line in enumerate(output.splitlines()):
713 if idx:
/usr/local/lib/python2.7/dist-packages/pandas/core/base.pyc in __repr__(self)
62 Yields Bytestring in Py2, Unicode String in py3.
63 """
---> 64 return str(self)
65
66
/usr/local/lib/python2.7/dist-packages/pandas/core/base.pyc in __str__(self)
42 if compat.PY3:
43 return self.__unicode__()
---> 44 return self.__bytes__()
45
46 def __bytes__(self):
/usr/local/lib/python2.7/dist-packages/pandas/core/base.pyc in __bytes__(self)
54
55 encoding = get_option("display.encoding")
---> 56 return self.__unicode__().encode(encoding, 'replace')
57
58 def __repr__(self):
/usr/local/lib/python2.7/dist-packages/pandas/sparse/series.pyc in __unicode__(self)
290 def __unicode__(self):
291 # currently, unicode is same as repr...fixes infinite loop
--> 292 series_rep = Series.__unicode__(self)
293 rep = '%s\n%s' % (series_rep, repr(self.sp_index))
294 return rep
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __unicode__(self)
897
898 self.to_string(buf=buf, name=self.name, dtype=self.dtype,
--> 899 max_rows=max_rows)
900 result = buf.getvalue()
901
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in to_string(self, buf, na_rep, float_format, header, length, dtype, name, max_rows)
962 the_repr = self._get_repr(float_format=float_format,
na_rep=na_rep,
963 header=header, length=length, dtype=dtype,
--> 964 name=name, max_rows=max_rows)
965
966 # catch contract violations
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in _get_repr(self, name, header, length, dtype, na_rep, float_format, max_rows)
991 na_rep=na_rep,
992 float_format=float_format,
--> 993 max_rows=max_rows)
994 result = formatter.to_string()
995
/usr/local/lib/python2.7/dist-packages/pandas/core/format.pyc in __init__(self, series, buf, length, header, na_rep, name, float_format, dtype, max_rows)
146 self.dtype = dtype
147
--> 148 self._chk_truncate()
149
150 def _chk_truncate(self):
/usr/local/lib/python2.7/dist-packages/pandas/core/format.pyc in _chk_truncate(self)
159 else:
160 row_num = max_rows // 2
--> 161 series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
162 self.tr_row_num = row_num
163 self.tr_series = series
/usr/local/lib/python2.7/dist-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
/usr/local/lib/python2.7/dist-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index,
verify_integrity, copy)
803 for obj in objs:
804 if not isinstance(obj, NDFrame):
--> 805 raise TypeError("cannot concatenate a non-NDFrame
object")
806
807 # consolidate
TypeError: cannot concatenate a non-NDFrame object
What is the difference between the simpler and the more complex case? Why does the code work in one case but not in the other? Could it be related to dtypes? I checked both cases and the dtypes are the same for each column, so I don't think the issue lies there. Moreover, do you think the two issues, i.e. the list comprehension problem and the never-ending computation, are related? I hope so -> one solution for two problems.
Your help would be very appreciated and I am willing to give more details if necessary. Many thanks.