Cannot Get Series From Dataframe (Python)

import numpy as np
import pandas as pd

a = np.random.standard_normal((9, 4))
dg = pd.DataFrame(a)
dg.columns = [["No1", "No2", "No3", "No4"]]
dg["No1"]
Hello all. I have been using JupyterLab opened through Anaconda Navigator, and I wrote the above code. The first three lines run fine; however, the fourth line gives me the error below. If I change the fourth line to dg[["No1"]] then it "works". However, in that case type(dg[["No1"]]) is actually a DataFrame, not a Series.
I am a bit of a noob and I have scratched my head for almost two hours and still don't understand what's wrong. Can somebody help? Thanks!
TypeError Traceback (most recent call last)
<ipython-input-393-b26f43cf53bf> in <module>
----> 1 dg["No1"]
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2774 if self.columns.nlevels > 1:
2775 return self._getitem_multilevel(key)
-> 2776 return self._get_item_cache(key)
2777
2778 # Do we have a slicer (on rows)?
~\anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
3584 res = cache.get(item)
3585 if res is None:
-> 3586 values = self._data.get(item)
3587 res = self._box_item_values(item, values)
3588 cache[item] = res
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in get(self, item)
966 raise ValueError("cannot label index with a null key")
967
--> 968 return self.iget(loc)
969 else:
970
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in iget(self, i)
983 Otherwise return as a ndarray
984 """
--> 985 block = self.blocks[self._blknos[i]]
986 values = block.iget(self._blklocs[i])
987
TypeError: only integer scalar arrays can be converted to a scalar index

You can just do this, unless you want a MultiIndex. Wrapping the names in a nested list creates a one-level MultiIndex for the columns, which is why dg["No1"] fails:
dg.columns = ["No1", "No2", "No3", "No4"]


How to build a for loop that prints the sentiment score of each string and does not produce a key error?

I have a dataset of tweets that I put into a pandas dataframe, and I converted each row to a string so that each row could be analysed with my sentiment analyzer. I'm trying to print the sentiment score of each tweet using a for loop:
for row in msmarvel.Text:
    print(text_sentiment(row))
It works for the first few tweets,
2.4332083615899887
3.479569526740967
2.426372867331215
2.2458306180346703
2.2478570548004133
0.9351690267777979
but then gives this error:
KeyError Traceback (most recent call last)
C:\Users\SHEHZA~1\AppData\Local\Temp/ipykernel_2420/262060431.py in <module>
3 if word not in embeddings.index:
4 continue
----> 5 print(text_sentiment(row))
C:\Users\SHEHZA~1\AppData\Local\Temp/ipykernel_2420/923749346.py in text_sentiment(text)
5 def text_sentiment(text):
6 tokens = [token.casefold() for token in TOKEN_RE.findall(text)]
----> 7 sentiments = words_sentiment(tokens)
8 return sentiments['sentiment'].mean()
C:\Users\SHEHZA~1\AppData\Local\Temp/ipykernel_2420/994030881.py in words_sentiment(words)
11
12 def words_sentiment(words):
---> 13 vecs = embeddings.loc[words].dropna() # vectors are defined by searching words (we provide) that are in the embeddings dictionary
14 log_odds = vector_sentiment(vecs) # vector sentiment is calculated by getting the log probability
15 return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)
~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
929
930 maybe_callable = com.apply_if_callable(key, self.obj)
--> 931 return self._getitem_axis(maybe_callable, axis=axis)
932
933 def _is_scalar_access(self, key: tuple):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1151 raise ValueError("Cannot index with multidimensional key")
1152
-> 1153 return self._getitem_iterable(key, axis=axis)
1154
1155 # nested tuple slicing
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_iterable(self, key, axis)
1091
1092 # A collection of keys
-> 1093 keyarr, indexer = self._get_listlike_indexer(key, axis)
1094 return self.obj._reindex_with_indexers(
1095 {axis: [keyarr, indexer]}, copy=True, allow_dups=True
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis)
1312 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1313
-> 1314 self._validate_read_indexer(keyarr, indexer, axis)
1315
1316 if needs_i8_conversion(ax.dtype) or isinstance(
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis)
1375
1376 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 1377 raise KeyError(f"{not_found} not in index")
1378
1379
KeyError: "['fbexclusive'] not in index"
The problem is there are words in some of the tweets (particularly slang words or grammatically incorrect words) that can't be analyzed with the sentiment analyzer because they are not present in the word embeddings dataframe. So I keep getting a key error.
I need to create a for loop that ignores any words that aren't in the embeddings vocabulary but still prints the sentiment score for each string otherwise. How should I do this?
In your sentiment functions you can use the try/except concept, so you can define what to do if an exception is raised. It's not going to be a perfect example because I don't know what your functions actually do, but you can try:
def text_sentiment(text):
    try:
        tokens = [token.casefold() for token in TOKEN_RE.findall(text)]
        sentiments = words_sentiment(tokens)
        return sentiments['sentiment'].mean()
    except KeyError:
        pass
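If you'd rather still print a score for every tweet instead of skipping the failing ones, another option is to drop out-of-vocabulary tokens before the lookup. A sketch under the question's setup (TOKEN_RE, embeddings, and words_sentiment as defined there; returning NaN for tweets with no known words is my assumption):
def text_sentiment(text):
    tokens = [token.casefold() for token in TOKEN_RE.findall(text)]
    # Keep only tokens that exist in the embeddings vocabulary,
    # so words_sentiment never sees an unknown key.
    known = [t for t in tokens if t in embeddings.index]
    if not known:
        return float("nan")  # nothing scorable in this tweet (assumption)
    return words_sentiment(known)['sentiment'].mean()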

IndexSlice on a datetime MultiIndex not working, but doesn't seem different from a properly-working toy equivalent

I'm used to using IndexSlice on datetime indices. This is a toy equivalent of my MultiIndex DataFrame, and you can see the slicing works:
import pandas as pd
idx = pd.IndexSlice

# slicing works on a simple DateTime index
qf = pd.DataFrame(index=pd.date_range(start="1Jan2019", freq="d", periods=30))
qf.loc[idx['2019-1-15':None]]  # works

# the same slicing works on a MultiIndex
qf.reset_index(inplace=True)
qf['foo'] = "bar"
qf['other'] = range(len(qf))
qf['filler'] = "egbdf"
qf.set_index(['index', 'foo', 'other'], inplace=True)
qf.loc[idx['2019-1-15':'2019-1-20', :, :], :]  # works
qf.loc[idx['2019-1-15':None, 'bar', :], :]  # works
But something is going on with my real DataFrame. I cannot see what the difference is.
xf.loc[idx['2019-5-1':'2019-6-1',"squat",:],:] # This works ok
xf.loc[idx['2019-5-1':None,"squat",:],:] # This fails
The error I get when I slice with '2019-5-1':None is
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-280-b0dce8e9e337> in <module>
1 xf.loc[idx['2019-5-1':'2019-6-1',"squat",:],:] # This works ok
----> 2 xf.loc[idx['2019-5-1':None,"squat",:],:] # This fails
3 #xf
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1492 except (KeyError, IndexError, AttributeError):
1493 pass
-> 1494 return self._getitem_tuple(key)
1495 else:
1496 # we by definition only have the 0th axis
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
866 def _getitem_tuple(self, tup):
867 try:
--> 868 return self._getitem_lowerdim(tup)
869 except IndexingError:
870 pass
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
967 # we may have a nested tuples indexer here
968 if self._is_nested_tuple_indexer(tup):
--> 969 return self._getitem_nested_tuple(tup)
970
971 # we maybe be using a tuple to represent multiple dimensions here
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexing.py in _getitem_nested_tuple(self, tup)
1046
1047 current_ndim = obj.ndim
-> 1048 obj = getattr(obj, self.name)._getitem_axis(key, axis=axis)
1049 axis += 1
1050
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1904 # nested tuple slicing
1905 if is_nested_tuple(key, labels):
-> 1906 locs = labels.get_locs(key)
1907 indexer = [slice(None)] * self.ndim
1908 indexer[axis] = locs
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexes\multi.py in get_locs(self, seq)
2774 # a slice, include BOTH of the labels
2775 indexer = _update_indexer(_convert_to_indexer(
-> 2776 self._get_level_indexer(k, level=i, indexer=indexer)),
2777 indexer=indexer)
2778 else:
C:\ProgramData\Anaconda3\envs\nambu\lib\site-packages\pandas\core\indexes\multi.py in _get_level_indexer(self, key, level, indexer)
2635 # note that the stop ALREADY includes the stopped point (if
2636 # it was a string sliced)
-> 2637 return convert_indexer(start.start, stop.stop, step)
2638
2639 elif level > 0 or self.lexsort_depth == 0 or step is not None:
AttributeError: 'int' object has no attribute 'stop'
I cannot see any material difference between the toy index and the real index, and I cannot see how the error message relates to passing None into the slicer.
========================================================
I figured out why it works/doesn't work in different examples.
The code works ok when the index is entirely dates. But if the index has datetimes in it, it fails.
# this index is solely dates, not datetimes, and everything works
dt_index = pd.date_range(start="1jan2019", periods=100, freq="d")
zf = pd.DataFrame(index=dt_index)
zf['foo'] = 10
zf['bar'] = "squat"
zf['zaa'] = range(len(dt_index))
zf.index.name = "date"
zf = zf.reset_index().set_index(["date", "bar", "zaa"])
zf.loc[idx['2019-1-1':'2019-1-3', "squat", :], :]  # works
zf.loc[idx['2019-1-1':, "squat", :], :]  # works
zf.loc[idx['2019-1-1':None, 'squat', :, :], :]  # works
The failing example:
dt_index = pd.date_range(start="1jan2019 00:15:33",periods=100,freq="h")
zf = pd.DataFrame(index=dt_index)
zf['foo']=10
zf['bar']="squat"
zf['zaa']=range(len(dt_index))
zf.index.name="date"
zf = zf.reset_index().set_index(["date", "bar", "zaa"])
zf.loc[idx['2019-1-1':'2019-1-3',"squat",:],:] # This works ok
#zf.loc[idx['2019-1-1':,"squat",:],:] # This fails AttributeError: 'int' object has no attribute 'stop'
#zf.loc[idx['2019-1-1':None,'squat',:,:],:] # AttributeError: 'int' object has no attribute 'stop'
This seems like a bug. According to this discussion, check lines 2614-2637 of multi.py in the pandas package:
try:
    if key.start is not None:
        start = level_index.get_loc(key.start)
    else:
        start = 0
    if key.stop is not None:
        stop = level_index.get_loc(key.stop)
    else:
        stop = len(level_index) - 1
    step = key.step
except KeyError:
    # we have a partial slice (like looking up a partial date
    # string)
    start = stop = level_index.slice_indexer(key.start, key.stop,
                                             key.step, kind='loc')
    step = start.step

if isinstance(start, slice) or isinstance(stop, slice):
    # we have a slice for start and/or stop
    # a partial date slicer on a DatetimeIndex generates a slice
    # note that the stop ALREADY includes the stopped point (if
    # it was a string sliced)
    return convert_indexer(start.start, stop.stop, step)
The stop will always be an int here, because the endpoint is None. But the start is different for qf and xf. The datetime index of qf has a resolution of one day, so qf.index.levels[0].get_loc('2019-01-17') is an int. The resolution of xf, however, is 0.001s, so xf.index.levels[0].get_loc('2019-01-17') is a slice, which leads to convert_indexer calling stop.stop while stop is an int.
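To see the asymmetry directly, compare get_loc on the two kinds of index (a quick sketch; the exact return types depend on the pandas version, but this matches the behaviour described above):
daily = pd.date_range(start="2019-01-01", periods=30, freq="d")
print(daily.get_loc("2019-01-17"))   # an int: exact match at one position

hourly = pd.date_range(start="2019-01-01 00:15:33", periods=100, freq="h")
print(hourly.get_loc("2019-01-02"))  # a slice: the partial string spans many rows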
As a workaround, you can use a very large date instead of None:
xf.loc[idx['2019-5-1':'2222',"squat",:],:]
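Alternatively, an explicit timestamp endpoint keeps both ends of the slice on the same code path; pd.Timestamp.max should serve as an 'open' end (my suggestion, same idea as the '2222' trick):
xf.loc[idx['2019-5-1':pd.Timestamp.max, "squat", :], :]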

join two tuples with different index and different shape

I have two tuples which I would like to have in one, but I always get a lot of NaNs when I try to concat, merge, or join them.
My tuple row looks like this:
and my tuple row_ges looks like this (shape: 5 rows, 6 columns):
I would like to have this in one dataframe with this shape (5 rows, 301 columns):
I tried
result=row_ges+row
result_1=row_ges.append(row,ignore_index=True)
result_2 = pd.concat([row_ges,row], axis=1, ignore_index=True)
but I always get a shape like (10 rows, 301 columns), and for result_2 I get an error I don't understand:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-209-cdc656a8e828> in <module>()
4 result=row_ges+row
5 result_1=row_ges.append(row,ignore_index=True)
----> 6 result_2 = pd.concat([row_ges,row], axis=1, ignore_index=True)
7
8 #gesamt = pd.DataFrame()
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, sort, copy)
224 verify_integrity=verify_integrity,
225 copy=copy, sort=sort)
--> 226 return op.get_result()
227
228
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/concat.py in get_result(self)
421 new_data = concatenate_block_managers(
422 mgrs_indexers, self.new_axes, concat_axis=self.axis,
--> 423 copy=self.copy)
424 if not self.copy:
425 new_data._consolidate_inplace()
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy)
5423 blocks.append(b)
5424
-> 5425 return BlockManager(blocks, axes)
5426
5427
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check)
3280
3281 if do_integrity_check:
-> 3282 self._verify_integrity()
3283
3284 self._consolidate_check()
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in _verify_integrity(self)
3491 for block in self.blocks:
3492 if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
-> 3493 construction_error(tot_items, block.shape[1:], self.axes)
3494 if len(self.items) != tot_items:
3495 raise AssertionError('Number of manager items must equal union of '
/usr/local/lib/python3.6/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
4841 raise ValueError("Empty data passed with indices specified.")
4842 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 4843 passed, implied))
4844
4845
ValueError: Shape of passed values is (301, 29), indices imply (301, 9)
The problem was the different indexes. With this code it works:
row.reset_index(drop=True, inplace=True)
row_ges.reset_index(drop=True, inplace=True)
result = pd.concat([row_ges, row], axis=1)
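To see why the indexes matter, here is a minimal sketch with hypothetical frames standing in for row and row_ges: concat with axis=1 aligns on index labels, so disjoint indexes produce NaN-padded rows, while resetting both indexes makes the rows line up by position.
import pandas as pd

row = pd.DataFrame({'a': [1, 2, 3]}, index=[10, 11, 12])
row_ges = pd.DataFrame({'b': [4, 5, 6]}, index=[0, 1, 2])

# Aligned on index labels: no labels overlap, so we get 6 rows full of NaNs
print(pd.concat([row_ges, row], axis=1))

# Positionally aligned after resetting the indexes: 3 rows, no NaNs
row = row.reset_index(drop=True)
row_ges = row_ges.reset_index(drop=True)
print(pd.concat([row_ges, row], axis=1))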

Python - Save dataframe to CSV "too many indices for array" error

I am trying to save a dataframe as CSV and get a "too many indices for array" error. The code used for the save is:
df.to_csv('CCS_Matrix.csv')
The dataframe looks like this
   Var10  Var100  Var101
0      0       1       1
1      0       0       1
2      0       1       0
There are 250 columns and about 10 million rows in the dataset.
The dtypes for the dataframe are
Var10 int64
Var100 int64
Var101 int64
etc.
All the dtypes are the same for the 250 columns.
Here is the full output of the error message
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-16-37cbe55e6c0d> in <module>()
----> 1 df.to_csv('CCS_Matrix.csv', encoding='utf-8')
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in to_csv(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, tupleize_cols, date_format, doublequote, escapechar, decimal)
1401 doublequote=doublequote,
1402 escapechar=escapechar, decimal=decimal)
-> 1403 formatter.save()
1404
1405 if path_or_buf is None:
~/anaconda3/lib/python3.6/site-packages/pandas/io/formats/format.py in save(self)
1590 self.writer = csv.writer(f, **writer_kwargs)
1591
-> 1592 self._save()
1593
1594 finally:
~/anaconda3/lib/python3.6/site-packages/pandas/io/formats/format.py in _save(self)
1691 break
1692
-> 1693 self._save_chunk(start_i, end_i)
1694
1695 def _save_chunk(self, start_i, end_i):
~/anaconda3/lib/python3.6/site-packages/pandas/io/formats/format.py in _save_chunk(self, start_i, end_i)
1705 decimal=self.decimal,
1706 date_format=self.date_format,
-> 1707 quoting=self.quoting)
1708
1709 for col_loc, col in zip(b.mgr_locs, d):
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in to_native_types(self, slicer, na_rep, quoting, **kwargs)
611 values = self.values
612 if slicer is not None:
--> 613 values = values[:, slicer]
614 mask = isnull(values)
615
~/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/array.py in __getitem__(self, key)
417 return self._get_val_at(key)
418 elif isinstance(key, tuple):
--> 419 data_slice = self.values[key]
420 else:
421 if isinstance(key, SparseArray):
IndexError: too many indices for array
Could you print out type(df)?
I've noted this problem in SparseDataFrames here.
I was able to solve the problem by calling .to_dense() on the SparseDataFrame, yielding a traditional DataFrame. Worked fine after that. Clearly that's not ideal for memory reasons, but at least it works in the short term.
The pandas team has responded that it is indeed a bug.
You can try another option to save as CSV, .toCSV('name.csv'), but that just gives a different error message ('SparseDataFrame' object has no attribute 'toCSV'). So the problem was solved by turning the dataframe into a dense dataframe:
df.to_dense().to_csv("submission.csv", index = False, sep=',', encoding='utf-8')
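As a side note for newer environments (an assumption about later pandas versions, not the one in the traceback above): SparseDataFrame was removed in pandas 1.0, and sparse data lives in sparse-dtype columns instead, so the densify-then-save step would look like this:
# pandas >= 1.0: densify sparse-dtype columns via the .sparse accessor
df.sparse.to_dense().to_csv("submission.csv", index=False)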

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

Using Python, I am struggling to merge 208 CSV files into one dataframe. (My file names are Customer_1.csv, Customer_2.csv, ..., and Customer_208.csv.)
Here is my code:
%matplotlib inline
import pandas as pd
df_merged = pd.concat([pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i), names = ['Time', 'Energy_{0}'.format(i)], parse_dates=['Time'], index_col=['Time'], skiprows=1) for i in range(1, 209)], axis=1)
I got an error saying,
InvalidIndexError Traceback (most recent call last)
<ipython-input-4-a4d19b3c2a3e> in <module>()
----> 1 df_merged = pd.concat([pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i), names = ['Time', 'Energy_{0}'.format(i)], parse_dates=['Time'], index_col=['Time'], skiprows=1) for i in range(1, 209)], axis=1)
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
884 self.copy = copy
885
--> 886 self.new_axes = self._get_new_axes()
887
888 def get_result(self):
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_new_axes(self)
944 if i == self.axis:
945 continue
--> 946 new_axes[i] = self._get_comb_axis(i)
947 else:
948 if len(self.join_axes) != ndim - 1:
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_comb_axis(self, i)
970 raise TypeError("Cannot concatenate list of %s" % types)
971
--> 972 return _get_combined_index(all_indexes, intersect=self.intersect)
973
974 def _get_concat_axis(self):
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in _get_combined_index(indexes, intersect)
5730 index = index.intersection(other)
5731 return index
-> 5732 union = _union_indexes(indexes)
5733 return _ensure_index(union)
5734
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in _union_indexes(indexes)
5759
5760 if hasattr(result, 'union_many'):
-> 5761 return result.union_many(indexes[1:])
5762 else:
5763 for other in indexes[1:]:
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tseries/index.pyc in union_many(self, others)
847 else:
848 tz = this.tz
--> 849 this = Index.union(this, other)
850 if isinstance(this, DatetimeIndex):
851 this.tz = tz
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in union(self, other)
1400 result.extend([x for x in other.values if x not in value_set])
1401 else:
-> 1402 indexer = self.get_indexer(other)
1403 indexer, = (indexer == -1).nonzero()
1404
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in get_indexer(self, target, method, limit)
1685
1686 if not self.is_unique:
-> 1687 raise InvalidIndexError('Reindexing only valid with uniquely'
1688 ' valued Index objects')
1689
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
Do you have any idea how to solve this problem?
Your code works on a small sample of five files that I used for testing (each file containing two columns and three rows). ONLY FOR DEBUGGING, try to write this as a for loop. First, before the loop, read all of the files into a list. Then loop and append each one using a try/except block to catch the errors. Finally, print the problem files and investigate.
# First, read all the files into a list.
files_in = [pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i),
                        names=['Time', 'Energy_{0}'.format(i)],
                        parse_dates=['Time'],
                        index_col=['Time'],
                        skiprows=1)
            for i in range(1, 209)]

df = pd.DataFrame()
errors = []

# Try to append each file to the dataframe.
for i in range(1, 209):
    try:
        df = pd.concat([df, files_in[i - 1]], axis=1)
    except Exception:
        errors.append(i)

# Print the files containing errors (files_in is 0-indexed, hence i - 1).
for i in errors:
    print(files_in[i - 1])
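The error itself ("Reindexing only valid with uniquely valued Index objects") points at duplicate Time values inside at least one file. Once the loop above has identified the offenders, one way to make the concat succeed is to drop repeated timestamps first (a sketch, assuming that keeping the first reading per timestamp is acceptable):
# Drop duplicate timestamps, keeping the first row for each,
# then concatenate all 208 frames along the columns.
deduped = [f[~f.index.duplicated(keep='first')] for f in files_in]
df_merged = pd.concat(deduped, axis=1)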
