pandas error: slice found as index? - python

The code below breaks on what I thought was the simplest of lines: adding a column called 'year' holding the constant value of the year variable. My packages are up to date with Anaconda 2.3.
Why is this indexing wrong?
The code:
# -*- coding: utf-8 -*-
import iopro
from pandas import *

for year in xrange(2005, 2013):
    for month in xrange(1, 13):
        if year == 2005 and month < 7:
            continue
        filename = 'SOMEPATH' + str(year) + '_mon' + str(month) + '.txt'
        adapter = iopro.text_adapter(filename, parser='csv', field_names=True, output='dataframe', delimiter='\t')
        monthly = adapter[['var1', 'var2', 'var3']][:]
        monthly['year'] = year
The error message is:
Traceback (most recent call last):
File "drugs.py", line 21, in <module>
monthly['year']=year
File "/home/seidav/anaconda/lib/python2.7/site-packages/pandas/core/frame.py", line 2127, in __setitem__
self._set_item(key, value)
File "/home/seidav/anaconda/lib/python2.7/site-packages/pandas/core/frame.py", line 2205, in _set_item
NDFrame._set_item(self, key, value)
File "/home/seidav/anaconda/lib/python2.7/site-packages/pandas/core/generic.py", line 1196, in _set_item
self._data.set(key, value)
File "/home/seidav/anaconda/lib/python2.7/site-packages/pandas/core/internals.py", line 2980, in set
loc = self.items.get_loc(item)
File "/home/seidav/anaconda/lib/python2.7/site-packages/pandas/core/index.py", line 1572, in get_loc
return self._engine.get_loc(_values_from_object(key))
File "pandas/index.pyx", line 134, in pandas.index.IndexEngine.get_loc (pandas/index.c:3824)
TypeError: expected string or Unicode object, slice found
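One way to rule out the adapter's DataFrame output (a sketch, not a confirmed fix; it assumes the files are ordinary tab-separated text with a header row that pandas can parse directly) is to build the monthly frame with pandas itself and then add the constant column:

import pandas as pd

# Sketch: parse the tab-separated file with pandas instead of iopro, so that
# 'monthly' is a plain DataFrame and column assignment behaves as expected.
monthly = pd.read_csv(filename, sep='\t', usecols=['var1', 'var2', 'var3'])
monthly['year'] = year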

Related

KeyError in pandas assign of masked df

I inherited a code base that relies heavily on DataFrame.assign and dict unpacking for arguments, which is not something I had seen much before.
I was doing some tests today and I think I hit an edge case; I have been looking for an explanation and/or a solution for the last few hours.
I cannot share the data, but I have managed to create the following MRE.
import sys
import pandas as pd
print("python", sys.version)
print("pandas", pd.__version__)
# template df for the output format
def template_df():
    return pd.DataFrame(
        columns=["id", "subid", "empty", "dat1", "dat2", "dat3", "dat4"]
    )

# input data looks like this
df = pd.DataFrame({
    "id1": [None, None, "0039"],
    "id2": ["10", "12", "a1"],
    "dat": [601, 482, 890],
})

# filter on id2 like 'a%'
m1 = df["id2"].str.startswith("a")

# start building output with input data and constant data
output = template_df().assign(**{
    "id": df.loc[m1, "id2"],
    "subid": df.loc[m1, "id1"],
    "dat1": df.loc[m1, "dat"],
    "dat2": "constant2",
})

# filter for id1 = '0039'
m2 = output["subid"].str.match("0039")

# add data for id1 = '0039' only
output[m2] = output[m2].assign(**{"dat3": "dependent3", "dat4": "dependent4"})
When I execute the code above, I get:
% python soq.py
python 3.9.12 (main, Mar 26 2022, 15:51:15)
[Clang 13.1.6 (clang-1316.0.21.2)]
pandas 1.4.1
Traceback (most recent call last):
File "/path/to/src/dir/soq.py", line 35, in <module>
output[m2] = output[m2].assign(**{"dat3": "dependent3", "dat4": "dependent4"})
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/frame.py", line 3643, in __setitem__
self._setitem_array(key, value)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/frame.py", line 3678, in _setitem_array
self.iloc[indexer] = value
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 716, in __setitem__
iloc._setitem_with_indexer(indexer, value, self.name)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1688, in _setitem_with_indexer
self._setitem_with_indexer_split_path(indexer, value, name)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1724, in _setitem_with_indexer_split_path
self._setitem_with_indexer_frame_value(indexer, value, name)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1813, in _setitem_with_indexer_frame_value
self._setitem_single_column(loc, val, pi)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1875, in _setitem_single_column
ser = value[np.argsort(pi)]
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/series.py", line 984, in __getitem__
return self._get_with(key)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/series.py", line 1019, in _get_with
return self.loc[key]
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 967, in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1191, in _getitem_axis
return self._getitem_iterable(key, axis=axis)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1132, in _getitem_iterable
keyarr, indexer = self._get_listlike_indexer(key, axis)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1327, in _get_listlike_indexer
keyarr, indexer = ax._get_indexer_strict(key, axis_name)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 5782, in _get_indexer_strict
self._raise_if_missing(keyarr, indexer, axis_name)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 5842, in _raise_if_missing
raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Int64Index([0], dtype='int64')] are in the [index]"
You are not using assign() properly. You should instead use .loc:
output.loc[m2, ['dat3', 'dat4']] = ["dependent3", "dependent4"]
See this for more info.
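Applied to the MRE above, the last step would become something like this (a sketch; it assumes the goal is to write the two constant strings only to the rows matched by m2):

# filter for id1 = '0039'
m2 = output["subid"].str.match("0039")

# write the dependent constants only to the matching rows
output.loc[m2, ["dat3", "dat4"]] = ["dependent3", "dependent4"]
print(output)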

Getting KeyError using Pandas when accessing .csv files

For some reason pandas is throwing an error when looking through some .csv stock data I have. Here is the error:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3078, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "./python-for-finance-7.py", line 75, in
compile_data()
File "./python-for-finance-7.py", line 59, in compile_data
df.set_index('Date', inplace=True)
File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", >line 3909, in set_index
level = frame[col]._values
File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", >line 2688, in getitem
return self._getitem_column(key)
File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", >line 2695, in _getitem_column
return self._get_item_cache(key)
File "/usr/local/lib/python3.7/site-packages/pandas/core/generic.py", line 2489, in _get_item_cache
values = self._data.get(item)
File "/usr/local/lib/python3.7/site->packages/pandas/core/internals.py", line 4115, in get
loc = self.items.get_loc(item)
File "/usr/local/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3080, in get_loc> return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'
The error is raised by this code:
import bs4 as bs
import datetime as dt
import os
import pandas as pd
import pandas_datareader.data as web
import pickle
import requests

def compile_data():
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)
    main_df = pd.DataFrame()
    for count, ticker in enumerate(tickers):
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker),
                         delimiter=',', encoding="utf-8-sig")
        df.set_index('Date', inplace=True)
        df.rename(columns={'Adj Close': ticker}, inplace=True)
        df.drop(['High', 'Low', 'Open', 'Close', 'Volume'], 1, inplace=True)
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how='outer')
        print(count)
    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')

compile_data()
The data in the CSV files is arranged like this:
Date High Low Open Close Volume Adj. Close
yyyy-mm-dd $$ $$ $$ $$ $$ $$
I tried changing the casing of Date (i.e. changing 'Date' to 'date'), but that just moves on to throw another error:
KeyError: "['High', 'Low', 'Open', 'Close', 'Volume'] not found in axis"
Can someone please help?
It looks like you're using the wrong delimiter. The file is whitespace-delimited, not comma-delimited.
Try using a whitespace delimiter:
df = pd.read_csv('stock_dfs/{}.csv'.format(ticker),
                 delimiter=r'\s+', encoding="utf-8-sig")
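If you're unsure which delimiter a file actually uses, it can help to parse a few rows and inspect the resulting columns (a quick sanity check; 'AAPL' here is just a placeholder ticker):

sample = pd.read_csv('stock_dfs/AAPL.csv', delimiter=r'\s+',
                     encoding="utf-8-sig", nrows=5)
print(sample.columns.tolist())  # 'Date' should show up here if the delimiter is right
print(sample.head())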
In my case, the data frame was empty when I tried to set the index: there were no rows at all.
It's worth checking
if len(df) > 0:
before setting the index.
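A minimal sketch of that guard inside the loop above (assuming empty files should simply be skipped):

df = pd.read_csv('stock_dfs/{}.csv'.format(ticker),
                 delimiter=r'\s+', encoding="utf-8-sig")
if len(df) > 0:  # only set the index when the file actually parsed to rows
    df.set_index('Date', inplace=True)
else:
    print('{} is empty, skipping'.format(ticker))
    continue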

KeyException in Python Pandas

I am receiving the following KeyError. I have a large data set (in the realm of 10 million records) and I am trying to filter only the records that contain a keyword in the 'tags' field. I can compare for exact keyword matches easily, but filtering for records that merely contain the keyword seems to be quite difficult, and every method I found on SO throws an error. I am new to Pandas, so please forgive me if I am committing a cardinal sin. (I took Big Data in university and we worked mostly in Spark; I realize the code is a bit hacky right now, I'm just trying to get it to function.)
Notes:
1. The data is stored across quarterly files, so I am iterating over the files and concatenating the results (which is the reason for the index and the counter).
2. I commented out the lines that let me filter for exact matches (is_goodwill = data_frame['tag'] == word_of_interest and good_will_relation = data_frame[is_goodwill]).
Goal: filter for records containing the keyword word_of_interest.
It does not have to be an exact match; the field just has to contain the keyword. The code is below the error.
Error
Traceback (most recent call last):
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2525, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'tags'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "parsePandasSECData.py", line 64, in <module>
main()
File "parsePandasSECData.py", line 42, in main
good_will_relation = data_frame[data_frame['tags'].str.contains(word_of_interest)]
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2139, in __getitem__
return self._getitem_column(key)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2146, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\generic.py", line 1842, in _get_item_cache
values = self._data.get(item)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\internals.py", line 3843, in get
loc = self.items.get_loc(item)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2527, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'tags'
Code
import pandas as pd
import os.path
import time
import csv

def main():
    start_time = time.time()
    file_path = "C:/Users/TA/Desktop/Data/"
    word_of_interest = "ShareholdersEquity"
    NUM_FILE_NAME = "number.csv"
    SUB_FILE_NAME = "subnumber.csv"
    quarterly_list = ['Q1', 'Q2', 'Q3', 'Q4']
    all_concat_data = None
    pd.set_option('display.max_row', 1000)
    for counter in range(9, 19):
        for index in range(len(quarterly_list)):
            # iterates over all file locations
            num_file_path = file_path + quarterly_list[index] + str(counter) + '/' + NUM_FILE_NAME
            sub_file_path = file_path + quarterly_list[index] + str(counter) + '/' + SUB_FILE_NAME
            if os.path.exists(num_file_path) and os.path.exists(sub_file_path):
                print('Starting ' + quarterly_list[index] + str(counter) + ' Data')
                # Load data
                data_frame = pd.read_csv(num_file_path, dtype={'adsh': str, 'tag': str, 'version coreg': str, 'ddate': int, 'qtrs': int, 'uom': str, 'value': float, 'footnote': str},
                                         header=0, delimiter='\t', low_memory=False, encoding='ISO-8859-1')
                # Comparative Data
                transaction_descriptions = pd.read_csv(sub_file_path, dtype={'adsh': str}, header=0, delimiter='\t', low_memory=False, encoding='ISO-8859-1')
                # is_goodwill = data_frame['tag'] == word_of_interest
                # good_will_relation = data_frame[is_goodwill]
                good_will_relation = data_frame[data_frame['tags'].str.contains(word_of_interest)]
                captured_data = good_will_relation.merge(transaction_descriptions, how='inner', left_on='adsh', right_on='adsh')
                if all_concat_data is not None:
                    all_concat_data = pd.concat([all_concat_data, captured_data])
                else:
                    all_concat_data = captured_data
            else:
                print(quarterly_list[index] + str(counter) + ' Does not exist...Skipping')
    print('Starting Writer operation')
    writer = pd.ExcelWriter('output.xlsx')
    all_concat_data.to_excel(writer, 'Sheet1')
    print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == "__main__":
    main()
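Note that the dtype mapping passed to read_csv declares the column as 'tag', while the failing line looks it up as 'tags'. If the column really is named 'tag', a substring filter would look like this (a sketch; na=False skips rows where the tag is missing):

# Assumes the column is named 'tag', matching the dtype mapping above.
mask = data_frame['tag'].str.contains(word_of_interest, na=False)
good_will_relation = data_frame[mask]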

Pandas hashtable gives KeyError: 0

I am trying to get the common elements of two pandas data tables by indexing them and merging on the index. I use this for a very large amount of data (millions of rows). The first table (df) is constant, and the second (d2) changes in every loop; its new elements are merged with the first table.
Here is my code for this process:
df = pd.read_csv("inputfile.csv",header=None)
d1 = pd.DataFrame(df).set_index(0)
for i in range(0, len(df)):
try:
follower_id=twitter.get_followers_ids(user_id=df.iloc[i][0],cursor=next_cursor)
f=follower_id['ids']
json.dumps(f)
d2 = pd.DataFrame(f).set_index(0)
match_result = pd.merge(d1,d2,left_index=True,right_index=True)
fk=[df.iloc[i][0] for number in range(len(match_result))]
DF = pd.DataFrame(fk)
DF.to_csv(r'output1.csv',header=None,sep=' ',index=None)
match_result.to_csv(r'output2.csv', header=None, sep=' ')
In my experience this code runs well for a while, but after that (probably related to the size of the second database, which changes in every loop) the program gives me the following error message and stops running:
Traceback (most recent call last):
File "halozat3.py", line 39, in <module>
d2 = pd.DataFrame(f).set_index(0)  # follower list of Trump follower #1
File "/usr/lib/python2.7/dist-packages/pandas/core/frame.py", line 2372, in set_index
level = frame[col].values
File "/usr/lib/python2.7/dist-packages/pandas/core/frame.py", line 1678, in __getitem__
return self._getitem_column(key)
File "/usr/lib/python2.7/dist-packages/pandas/core/frame.py", line 1685, in _getitem_column
return self._get_item_cache(key)
File "/usr/lib/python2.7/dist-packages/pandas/core/generic.py", line 1052, in _get_item_cache
values = self._data.get(item)
File "/usr/lib/python2.7/dist-packages/pandas/core/internals.py", line 2565, in get
loc = self.items.get_loc(item)
File "/usr/lib/python2.7/dist-packages/pandas/core/index.py", line 1181, in get_loc
return self._engine.get_loc(_values_from_object(key))
File "index.pyx", line 129, in pandas.index.IndexEngine.get_loc (pandas/index.c:3656)
File "index.pyx", line 149, in pandas.index.IndexEngine.get_loc (pandas/index.c:3534)
File "hashtable.pyx", line 381, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:7035)
File "hashtable.pyx", line 387, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:6976)
KeyError: 0
What could be the problem?
Do you have only one row in your dataframe?
You must write as many rows as you like.
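One likely trigger (an assumption, not confirmed here): when the followers list f comes back empty, pd.DataFrame(f) has no column 0, so set_index(0) raises KeyError: 0. A guard sketch inside the try block:

f = follower_id['ids']
if not f:  # no followers returned: DataFrame(f) would have no column 0
    continue
d2 = pd.DataFrame(f).set_index(0)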

"Already tz-aware" error when reading h5 file using pandas, python 3 (but not 2)

I have an h5 store named weather.h5. My default Python environment is 3.5.2. When I try to read this store I get TypeError: Already tz-aware, use tz_convert to convert.
I've tried both pd.read_hdf('weather.h5', 'weather_history') and pd.io.pytables.HDFStore('weather.h5')['weather_history'], but I get the error no matter what.
I can open the h5 in a Python 2.7 environment. Is this a bug in Python 3 / pandas?
I have the same issue. I'm using Anaconda Python: 3.4.5 and 2.7.3. Both are using pandas 0.18.1.
Here is a reproducible example:
generate.py (to be executed with Python2):
import pandas as pd
from pandas import HDFStore
index = pd.DatetimeIndex(['2017-06-20 06:00:06.984630-05:00', '2017-06-20 06:03:01.042616-05:00'], dtype='datetime64[ns, CST6CDT]', freq=None)
p1 = [0, 1]
p2 = [0, 2]
# Saving any of these dataframes cause issues
df1 = pd.DataFrame({"p1":p1, "p2":p2}, index=index)
df2 = pd.DataFrame({"p1":p1, "p2":p2, "i":index})
store = HDFStore("./test_issue.h5")
store['df'] = df1
#store['df'] = df2
store.close()
read_issue.py:
import pandas as pd
from pandas import HDFStore
store = HDFStore("./test_issue.h5", mode="r")
df = store['/df']
store.close()
print(df)
Running read_issue.py in Python2 has no issues and produces this output:
p1 p2
2017-06-20 11:00:06.984630-05:00 0 0
2017-06-20 11:03:01.042616-05:00 1 2
But running it in Python3 produces an error with this traceback:
Traceback (most recent call last):
File "read_issue.py", line 5, in
df = store['df']
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 417, in getitem
return self.get(key)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 634, in get
return self._read_group(group)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 1272, in _read_group
return s.read(**kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2779, in read
ax = self.read_index('axis%d' % i)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2367, in read_index
_, index = self.read_index_node(getattr(self.group, key))
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2492, in read_index_node
_unconvert_index(data, kind, encoding=self.encoding), **kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/indexes/base.py", line 153, in new
result = DatetimeIndex(data, copy=copy, name=name, **kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/util/decorators.py", line 91, in wrapper
return func(*args, **kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/tseries/index.py", line 321, in new
raise TypeError("Already tz-aware, use tz_convert "
TypeError: Already tz-aware, use tz_convert to convert.
Closing remaining open files:./test_issue.h5...done
So, there is an issue with indices. However, if you save df2 in generate.py (datetime as a column, not as an index), then Python3 in read_issue.py produces a different error:
Traceback (most recent call last):
File "read_issue.py", line 5, in
df = store['/df']
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 417, in getitem
return self.get(key)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 634, in get
return self._read_group(group)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 1272, in _read_group
return s.read(**kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2788, in read
placement=items.get_indexer(blk_items))
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/core/internals.py", line 2518, in make_block
return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/core/internals.py", line 90, in init
len(self.mgr_locs)))
ValueError: Wrong number of items passed 2, placement implies 1
Closing remaining open files:./test_issue.h5...done
Also, if you execute generate.py in Python3 (saving either df1 or df2), then there is no problem executing read_issue.py in either Python3 or Python2.
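One way to sidestep the Python2/Python3 incompatibility (a sketch, not a fix for the underlying pandas issue; it assumes you can regenerate the store, and test_issue_naive.h5 is just a placeholder name) is to write the index as tz-naive UTC and re-attach the timezone after reading:

# write side (sketch): store the timestamps as tz-naive UTC
import pandas as pd
from pandas import HDFStore

index = pd.DatetimeIndex(['2017-06-20 06:00:06.984630-05:00',
                          '2017-06-20 06:03:01.042616-05:00'],
                         dtype='datetime64[ns, CST6CDT]', freq=None)
df1 = pd.DataFrame({"p1": [0, 1], "p2": [0, 2]}, index=index)
df1.index = df1.index.tz_convert("UTC").tz_localize(None)  # drop the timezone

store = HDFStore("./test_issue_naive.h5")
store['df'] = df1
store.close()

# read side (sketch): re-attach the timezone after loading
store = HDFStore("./test_issue_naive.h5", mode="r")
df = store['df']
store.close()
df.index = df.index.tz_localize("UTC").tz_convert("CST6CDT")
print(df)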
