join two tuples with different index and different shape - python

I have two tuples which I would like to combine into one, but I always get a lot of NaN's when I try to concat, merge or join them.
my tuple row looks like this :
and my tuple row_ges looks like this (shape: 5 rows, 6 columns):
I would like to have this in one dataframe like this shape (5 rows,301 columns):
I tried
result=row_ges+row
result_1=row_ges.append(row,ignore_index=True)
result_2 = pd.concat([row_ges,row], axis=1, ignore_index=True)
but I always get a shape like (10 rows, 301 columns), and for result_2 I get an error code I don't understand:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-209-cdc656a8e828> in <module>()
4 result=row_ges+row
5 result_1=row_ges.append(row,ignore_index=True)
----> 6 result_2 = pd.concat([row_ges,row], axis=1, ignore_index=True)
7
8 #gesamt = pd.DataFrame()
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, sort, copy)
224 verify_integrity=verify_integrity,
225 copy=copy, sort=sort)
--> 226 return op.get_result()
227
228
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/concat.py in get_result(self)
421 new_data = concatenate_block_managers(
422 mgrs_indexers, self.new_axes, concat_axis=self.axis,
--> 423 copy=self.copy)
424 if not self.copy:
425 new_data._consolidate_inplace()
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy)
5423 blocks.append(b)
5424
-> 5425 return BlockManager(blocks, axes)
5426
5427
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check)
3280
3281 if do_integrity_check:
-> 3282 self._verify_integrity()
3283
3284 self._consolidate_check()
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in _verify_integrity(self)
3491 for block in self.blocks:
3492 if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
-> 3493 construction_error(tot_items, block.shape[1:], self.axes)
3494 if len(self.items) != tot_items:
3495 raise AssertionError('Number of manager items must equal union of '
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
4841 raise ValueError("Empty data passed with indices specified.")
4842 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 4843 passed, implied))
4844
4845
ValueError: Shape of passed values is (301, 29), indices imply (301, 9)

row.reset_index(drop=True, inplace=True)
row_ges.reset_index(drop=True, inplace=True)
result = pd.concat([row_ges,row], axis=1)
The problem was the different index. With this code it is working.

Related

getting the following error = ValueError: All arrays must be of the same length

df = pd.DataFrame({'user_message':messages, 'message_date':dates })
# convert message_date type
df['message_date'] = pd.to_datetime(df['message_date'], format='%d/%m/%Y, %H:%M - ')
df.rename(columns={'message_date': 'date'}, inplace=True)
df.head()
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_2028\475687816.py in <module>
----> 1 df = pd.DataFrame({'user_message':messages, 'message_date':dates })
2
3 # convert message_date type
4
5 df['message_date'] = pd.to_datetime(df['message_date'], format='%d/%m/%Y, %H:%M - ')
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
634 elif isinstance(data, dict):
635 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 636 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
637 elif isinstance(data, ma.MaskedArray):
638 import numpy.ma.mrecords as mrecords
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in dict_to_mgr(data, index, columns, dtype, typ, copy)
500 # TODO: can we get rid of the dt64tz special case above?
501
--> 502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
503
504
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, columns, index, dtype, verify_integrity, typ, consolidate)
118 # figure out the index, if necessary
119 if index is None:
--> 120 index = _extract_index(arrays)
121 else:
122 index = ensure_index(index)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in _extract_index(data)
672 lengths = list(set(raw_lengths))
673 if len(lengths) > 1:
--> 674 raise ValueError("All arrays must be of the same length")
675
676 if have_dicts:
ValueError: All arrays must be of the same length
I am trying to create a pandas dataframe with two columns named user_message and message_date.
In the next line I've tried to convert the message_date column into a datetime in the defined format.
It appears that your lists messages and dates in the line df = pd.DataFrame({'user_message':messages, 'message_date':dates }) are not the same length; you may want to verify that they have the same length before constructing the DataFrame.

Cannot Get Series From Dataframe (Python)

a = np.random.standard_normal((9,4))
dg = pd.DataFrame(a)
dg.columns = [["No1", "No2", "No3", "No4"]]
dg["No1"]
Hello all. I have been using JupyterLab opened through Anaconda Navigator and I wrote the above code. The first three lines look normal, however, for the fourth line I was given an error as below. If I change the fourth line into dg[["No1"]] then it "worked". However, in that case type(dg[["No1"]]) is actually dataframe, not series.
I am a bit of a noob and I have scratched my head for almost two hours and still don't understand what's wrong. Can somebody help? Thanks!
TypeError Traceback (most recent call last)
<ipython-input-393-b26f43cf53bf> in <module>
----> 1 dg["No1"]
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2774 if self.columns.nlevels > 1:
2775 return self._getitem_multilevel(key)
-> 2776 return self._get_item_cache(key)
2777
2778 # Do we have a slicer (on rows)?
~\anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
3584 res = cache.get(item)
3585 if res is None:
-> 3586 values = self._data.get(item)
3587 res = self._box_item_values(item, values)
3588 cache[item] = res
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in get(self, item)
966 raise ValueError("cannot label index with a null key")
967
--> 968 return self.iget(loc)
969 else:
970
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in iget(self, i)
983 Otherwise return as a ndarray
984 """
--> 985 block = self.blocks[self._blknos[i]]
986 values = block.iget(self._blklocs[i])
987
TypeError: only integer scalar arrays can be converted to a scalar index
You can just do this, unless you want multi-index :
dg.columns = ["No1", "No2", "No3", "No4"]

Pivot: ValueError: Index contains duplicate entries, cannot reshape [duplicate]

This question already has answers here:
Pandas pivot table ValueError: Index contains duplicate entries, cannot reshape
(2 answers)
Closed 4 years ago.
I want to plot a heatmap between hashtags and username from the given final table after cleaning and pre-processing.
Getting the following error.
I have pasted the full error which I'm getting I searched on similar StackOverflow errors but was unable to get the correct result.
final_sns = final.pivot("hashtags", "username")
ax = sns.heatmap(final_sns)
ValueError Traceback (most recent call last)
<ipython-input-51-277e0506604d> in <module>()
----> 1 final_sns = final.pivot("hashtags", "username")
2 ax = sns.heatmap(final_sns)
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
5192 """
5193 from pandas.core.reshape.reshape import pivot
-> 5194 return pivot(self, index=index, columns=columns, values=values)
5195
5196 _shared_docs['pivot_table'] = """
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in pivot(self, index, columns, values)
413 indexed = self._constructor_sliced(self[values].values,
414 index=index)
--> 415 return indexed.unstack(columns)
416
417
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\frame.py in unstack(self, level, fill_value)
5532 """
5533 from pandas.core.reshape.reshape import unstack
-> 5534 return unstack(self, level, fill_value)
5535
5536 _shared_docs['melt'] = ("""
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
493 if isinstance(obj, DataFrame):
494 if isinstance(obj.index, MultiIndex):
--> 495 return _unstack_frame(obj, level, fill_value=fill_value)
496 else:
497 return obj.T.stack(dropna=False)
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in _unstack_frame(obj, level, fill_value)
507 unstacker = partial(_Unstacker, index=obj.index,
508 level=level, fill_value=fill_value)
--> 509 blocks = obj._data.unstack(unstacker)
510 return obj._constructor(blocks)
511 else:
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\internals.py in unstack(self, unstacker_func)
4608 unstacked : BlockManager
4609 """
-> 4610 dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
4611 new_columns = dummy.get_new_columns()
4612 new_index = dummy.get_new_index()
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in __init__(self, values, index, level, value_columns, fill_value, constructor)
135
136 self._make_sorted_values_labels()
--> 137 self._make_selectors()
138
139 def _make_sorted_values_labels(self):
c:\users\apex_predator\appdata\local\programs\python\python36\lib\site-packages\pandas\core\reshape\reshape.py in _make_selectors(self)
173
174 if mask.sum() < len(self.index):
--> 175 raise ValueError('Index contains duplicate entries, '
176 'cannot reshape')
177
ValueError: Index contains duplicate entries, cannot reshape
What is that I am missing?
Seems like you have duplicated rows in your DataFrame, so your pivot doesn't know which row to take while pivoting.
Try this duplicated method to check them.

Python - Save dataframe to CSV "too many indices for array" error

I am trying to save a dataframe as CSV and get a "too many indices for array" error. The code used for the save is-
df.to_csv('CCS_Matrix.csv')
The dataframe looks like this
Var10 Var100 Var101
0 0 1 1
1 0 0 1
2 0 1 0
There are 250 columns and about 10 million rows in the dataset.
The dtypes for the dataframe are
Var10 int64
Var100 int64
Var101 int64
etc.
All the dtypes are the same for the 250 columns.
Here is the full output of the error message
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-16-37cbe55e6c0d> in <module>()
----> 1 df.to_csv('CCS_Matrix.csv', encoding='utf-8')
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in to_csv(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, tupleize_cols, date_format, doublequote, escapechar, decimal)
1401 doublequote=doublequote,
1402 escapechar=escapechar, decimal=decimal)
-> 1403 formatter.save()
1404
1405 if path_or_buf is None:
~/anaconda3/lib/python3.6/site-packages/pandas/io/formats/format.py in save(self)
1590 self.writer = csv.writer(f, **writer_kwargs)
1591
-> 1592 self._save()
1593
1594 finally:
~/anaconda3/lib/python3.6/site-packages/pandas/io/formats/format.py in _save(self)
1691 break
1692
-> 1693 self._save_chunk(start_i, end_i)
1694
1695 def _save_chunk(self, start_i, end_i):
~/anaconda3/lib/python3.6/site-packages/pandas/io/formats/format.py in _save_chunk(self, start_i, end_i)
1705 decimal=self.decimal,
1706 date_format=self.date_format,
-> 1707 quoting=self.quoting)
1708
1709 for col_loc, col in zip(b.mgr_locs, d):
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in to_native_types(self, slicer, na_rep, quoting, **kwargs)
611 values = self.values
612 if slicer is not None:
--> 613 values = values[:, slicer]
614 mask = isnull(values)
615
~/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/array.py in __getitem__(self, key)
417 return self._get_val_at(key)
418 elif isinstance(key, tuple):
--> 419 data_slice = self.values[key]
420 else:
421 if isinstance(key, SparseArray):
IndexError: too many indices for array
Could you print out type(df)?
I've noted this problem in SparseDataFrames here.
I was able to solve the problem by calling .to_dense() on the SparseDataFrame, yielding a traditional DataFrame. Worked fine after that. Clearly that's not ideal for memory reasons, but at least it works in the short term.
The pandas team has responded that it is indeed a bug.
You can also try saving as CSV with .toCSV('name.csv'). That will give you a different error message, like ('SparseDataFrame' object has no attribute 'toCSV'), which confirms the object is a SparseDataFrame.
So the problem was solved by turning dataframe to dense dataframe
df.to_dense().to_csv("submission.csv", index = False, sep=',', encoding='utf-8')

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

Using Python, I am struggling to merge 208 CSV files into one dataframe. (My files names are Customer_1.csv, Customer_2.csv,,, and Customer_208.csv)
Following are my codes,
%matplotlib inline
import pandas as pd
df_merged = pd.concat([pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i), names = ['Time', 'Energy_{0}'.format(i)], parse_dates=['Time'], index_col=['Time'], skiprows=1) for i in range(1, 209)], axis=1)
I got an error saying,
InvalidIndexError Traceback (most recent call last)
<ipython-input-4-a4d19b3c2a3e> in <module>()
----> 1 df_merged = pd.concat([pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i), names = ['Time', 'Energy_{0}'.format(i)], parse_dates=['Time'], index_col=['Time'], skiprows=1) for i in range(1, 209)], axis=1)
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
884 self.copy = copy
885
--> 886 self.new_axes = self._get_new_axes()
887
888 def get_result(self):
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_new_axes(self)
944 if i == self.axis:
945 continue
--> 946 new_axes[i] = self._get_comb_axis(i)
947 else:
948 if len(self.join_axes) != ndim - 1:
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_comb_axis(self, i)
970 raise TypeError("Cannot concatenate list of %s" % types)
971
--> 972 return _get_combined_index(all_indexes, intersect=self.intersect)
973
974 def _get_concat_axis(self):
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in _get_combined_index(indexes, intersect)
5730 index = index.intersection(other)
5731 return index
-> 5732 union = _union_indexes(indexes)
5733 return _ensure_index(union)
5734
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in _union_indexes(indexes)
5759
5760 if hasattr(result, 'union_many'):
-> 5761 return result.union_many(indexes[1:])
5762 else:
5763 for other in indexes[1:]:
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tseries/index.pyc in union_many(self, others)
847 else:
848 tz = this.tz
--> 849 this = Index.union(this, other)
850 if isinstance(this, DatetimeIndex):
851 this.tz = tz
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in union(self, other)
1400 result.extend([x for x in other.values if x not in value_set])
1401 else:
-> 1402 indexer = self.get_indexer(other)
1403 indexer, = (indexer == -1).nonzero()
1404
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in get_indexer(self, target, method, limit)
1685
1686 if not self.is_unique:
-> 1687 raise InvalidIndexError('Reindexing only valid with uniquely'
1688 ' valued Index objects')
1689
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
Do you have any idea how to solve this problem?
Your code works on a small sample of five files that I used for testing (each file containing two columns and three row). ONLY FOR DEBUGGING, try to write this in a for loop. First, before the loop, read all of the files into the list. Then loop again and append each one using a try/except block to catch the errors. Finally, print the problem files and investigate.
# First, read all the files into a list.
files_in = [pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i),
names = ['Time', 'Energy_{0}'.format(i)],
parse_dates=['Time'],
index_col=['Time'],
skiprows=1)
for i in range(1, 209)]
df = pd.DataFrame()
errors = []
# Try to append each file to the dataframe.
for i in range(1, 209):
try:
df = pd.concat([df, files_in[i - 1]], axis=1)
except:
errors.append(i)
# Print files containing errors.
for error in errors:
print(files_in[error])

Categories