KeyException in Python Pandas - python

I am receiving the following error related to a Key Error. I have a large data set (in the realm of 10 million records) and I am trying to filter only the records that contain a key word in the 'tags' field. I am able to compare for matching key words easily, but to parse for including key words seems to be quite difficult and any of the methods I have tried that I found on SO throw an error. I am new to Pandas so please forgive me if I am committing a cardinal sin. (Took BigData in university and we worked mostly in Spark. I realize the code is a bit hacky right now just trying to get it to function)
Notes: 1 The data is stored across quarterly files so I am iterating over the files and con-concatenating the results (which is the reason for the index and the counter) 2 I commented out the lines that allow me to parse for exact matches (#is_goodwill = data_frame['tag'] == word_of_interest && #good_will_relation = data_frame[is_goodwill])
Goal: Filter for records containing key word word_of_interest
It does not have to be an exact match to the key word, but rather contain the keyword. Code is below the error
Error
Traceback (most recent call last):
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2525, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'tags'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "parsePandasSECData.py", line 64, in <module>
main()
File "parsePandasSECData.py", line 42, in main
good_will_relation = data_frame[data_frame['tags'].str.contains(word_of_interest)]
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2139, in __getitem__
return self._getitem_column(key)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2146, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\generic.py", line 1842, in _get_item_cache
values = self._data.get(item)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\internals.py", line 3843, in get
loc = self.items.get_loc(item)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2527, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'tags'
Code
import pandas as pd
import os.path
import time
import csv
def main():
start_time = time.time()
file_path = "C:/Users/TA/Desktop/Data/"
word_of_interest = "ShareholdersEquity"
NUM_FILE_NAME = "number.csv"
SUB_FILE_NAME = "subnumber.csv"
quarterly_list = ['Q1', 'Q2', 'Q3', 'Q4']
all_concat_data = None
pd.set_option('display.max_row', 1000)
for counter in range(9,19):
for index in range(len(quarterly_list)):
#iterates over all file locations
num_file_path = file_path + quarterly_list[index] + str(counter) + '/' + NUM_FILE_NAME
sub_file_path = file_path + quarterly_list[index] + str(counter) + '/' + SUB_FILE_NAME
if os.path.exists(num_file_path) and os.path.exists(sub_file_path):
print('Starting ' + quarterly_list[index] + str(counter) + ' Data')
#Load data
data_frame = pd.read_csv(num_file_path, dtype={'adsh': str, 'tag': str, 'version coreg': str, 'ddate': int, 'qtrs': int, 'uom': str, 'value': float, 'footnote': str}, \
header=0, delimiter='\t', low_memory= False, encoding= 'ISO-8859-1')
#Comparative Data
transaction_descriptions = pd.read_csv(sub_file_path, dtype={'adsh': str}, header = 0, delimiter = '\t', low_memory=False, encoding='ISO-8859-1')
#is_goodwill = data_frame['tag'] == word_of_interest
#good_will_relation = data_frame[is_goodwill]
good_will_relation = data_frame[data_frame['tags'].str.contains(word_of_interest)]
captured_data = good_will_relation.merge(transaction_descriptions, how='inner', left_on='adsh', right_on='adsh')
if all_concat_data is not None:
all_concat_data = pd.concat([all_concat_data,captured_data])
else:
all_concat_data = captured_data
else:
print(quarterly_list[index] + str(counter) + ' Does not exist...Skipping')
print('Starting Writer operation')
writer = pd.ExcelWriter('output.xlsx')
all_concat_data.to_excel(writer, 'Sheet1')
print("--- %s seconds ---" % (time.time() - start_time))
if __name__ == "__main__":
main()

Related

KeyError: 0 when used in existing function, otherwise the code works fine

I want to do the following:
I have data in long format organized by dates
Sometimes, data is missing as it there is no record of it
I found a solution by interpolating missing data using reindex which works fine when used outside of function, but for some reason, doesn't work when used inside of a function
def sum_customer_portfolio(country, sold_to):
df = pd.merge(etl_customer_portfolio(), etl_week(), how="left", on=["Country", "GCAS"])
df = df.loc[df["Country"].isin(country)]
df = df.loc[df["Sold_to"].isin(sold_to)]
df_week = etl_week()
df_week = df_week.dropna(subset=["Sold_to"])
df_week = df_week[["Week_num", "Date_range"]]
df_week = df_week.drop_duplicates(subset=["Date_range"])
sum_df = pd.merge(df, df_week, how="outer", on=["Week_num", "Date_range"])
sum_df["Stat_unit_qty"] = sum_df["Stat_unit_qty"].fillna(0, axis=0)
sum_df[["Country", "Sold_to", "Customer"]] = sum_df[["Country", "Sold_to", "Customer"]].fillna(method="ffill",
axis=0)
sum_df = sum_df.fillna("DUMMY_NOT_USE").replace("DUMMY_NOT_USE", np.nan)
reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
reindex_subset = reindex_subset.dropna()
reindex_subset = reindex_subset.set_index("Week_num")
reindex_subset = (reindex_subset.groupby("GCAS").apply(
lambda x: x.reindex(list(range(reindex_subset.index.min(), reindex_subset.index.max() + 1)), fill_value=0))
.drop("GCAS", axis=1).
reset_index("GCAS").
fillna(0).
reset_index())
reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])
final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
current_date = datetime.now().strftime("%d%m%Y_%H%M%S")
# return sum_df.to_excel(f"CUSTOMER_PORTFOLIO-{current_date}.xlsx", sheet_name="GCAS_SUM", index=False)
return final_df
Code above keeps giving me the following error:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3361, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 76, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 103, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 135, in pandas._libs.index.IndexEngine._get_loc_duplicates
File "pandas\_libs\index_class_helper.pxi", line 51, in pandas._libs.index.Float64Engine._maybe_get_bool_indexer
File "pandas\_libs\index.pyx", line 161, in pandas._libs.index.IndexEngine._unpack_bool_indexer
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 167, in <module>
sum_customer_portfolio(country=["Croatia", "Slovenia"], sold_to=[2000829798, 2000558171]).to_excel(writer, index=False, sheet_name="GCAS_SUM")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 113, in sum_customer_portfolio
reindex_subset = (reindex_subset.groupby(["GCAS", "Sold_to"]).apply(
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1253, in apply
result = self._python_apply_general(f, self._selected_obj)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1287, in _python_apply_general
keys, values, mutated = self.grouper.apply(f, data, self.axis)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 783, in apply
result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 1328, in fast_apply
return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
File "pandas\_libs\reduction.pyx", line 369, in pandas._libs.reduction.apply_frame_axis0
File "pandas\_libs\reduction.pyx", line 428, in pandas._libs.reduction.BlockSlider.__init__
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\frame.py", line 3430, in __getitem__
indexer = convert_to_index_sliceable(self, key)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexing.py", line 2329, in convert_to_index_sliceable
return idx._convert_slice_indexer(key, kind="getitem")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\numeric.py", line 242, in _convert_slice_indexer
return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5686, in slice_indexer
start_slice, end_slice = self.slice_locs(start, end, step=step)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5894, in slice_locs
end_slice = self.get_slice_bound(end, "right")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5808, in get_slice_bound
raise err
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5802, in get_slice_bound
slc = self.get_loc(label)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3363, in get_loc
raise KeyError(key) from err
KeyError: 0
When loading the data directly from Excel (same data that produced by the function), for example, "CUSTOMER_PORTFOLIO-11082021_234057.xlsx" and running the following code:
sum_df = pd.read_excel("CUSTOMER_PORTFOLIO-11082021_234057.xlsx")
reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
reindex_subset = reindex_subset.dropna()
reindex_subset = reindex_subset.set_index("Week_num")
reindex_subset = (reindex_subset.groupby("GCAS").apply(
lambda x: x.reindex(list(range(reindex_subset.index.min(), reindex_subset.index.max() + 1)), fill_value=0))
.drop("GCAS", axis=1).
reset_index("GCAS").
fillna(0).
reset_index())
reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])
final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
The code gives me results that I want.
What am I missing? I tried searching for this on SO overflow, but no success as of yet. I have tried resetting index, but unfortunately, it didn't help.
UPDATE: Pasted the full error traceback. Moreover, as I said above, when I run the function without the part of the code that "reindexes" the data, the code works just fine. I have also tried and still no luck:
df_new = df.copy(deep=True)
df_week= df_week.copy(deep=True)
And when I run the "reindex" part of the code on a finished .xlsx, it works just fine, which is strange in itself.

How to fix KeyError: 'Id' in pandas dataframe

I am currently working on a project which uses facial recognition to record class attendance. The second block of codes below are for tracking(recognizing the faces in the video feed) but i keep getting the error in the third block. I think it has to do with my indexing and I'm not sure where the problem is coming from.
This is the code i used forcreating the StudentDetails.csv
res = "Images Saved for ID : " + Id +" Name : "+ name
row = [Id, name]
with open('StudentDetails\StudentDetails.csv','a+') as csvFile:
writer = csv.writer(csvFile)
writer.writerow(row)
csvFile.close()
message.configure(text= res)
else:
if(is_number(Id)):
res = "Enter Alphabetical Name"
message.configure(text= res)
if(name.isalpha or name.isspace()):
res = "Enter Numeric Id"
message.configure(text= res)
This the block of code for tracking
def TrackImages():
recognizer = cv2.face.LBPHFaceRecognizer_create()#cv2.createLBPHFaceRecognizer()
recognizer.read("TrainingImageLabel\Trainner.yml")
harcascadePath = "haarcascade_frontalface_default.xml"
faceCascade = cv2.CascadeClassifier(harcascadePath);
df=pd.read_csv("StudentDetails\StudentDetails.csv")
cam = cv2.VideoCapture(0)
font = cv2.FONT_HERSHEY_SIMPLEX
col_names = ['Id','Name','Date','Time']
attendance = pd.DataFrame(columns = col_names)
while True:
ret, img =cam.read()
gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
faces=faceCascade.detectMultiScale(gray, 1.2,5)
for(x,y,w,h) in faces:
cv2.rectangle(img,(x,y),(x+w,y+h),(225,0,0),2)
Id, conf = recognizer.predict(gray[y:y+h,x:x+w])
if(conf < 50):
ts = time.time()
date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d')
timeStamp = datetime.datetime.fromtimestamp(ts).strftime('%H:%M:%S')
aa=df.loc[df['Id'] == Id]['Name'].values
tt=str(Id)+"-"+aa
attendance.loc[len(attendance)] = [Id,aa,date,timeStamp]
else:
Id='Unknown'
tt=str(Id)
if(conf > 75):
noOfFile=len(os.listdir("ImagesUnknown"))+1
cv2.imwrite("ImagesUnknown\Image"+str(noOfFile) + ".jpg", im[y:y+h,x:x+w])
cv2.putText(img,str(tt),(x,y+h), font, 1,(255,255,255),2)
attendance=attendance.drop_duplicates(subset=['Id'],keep='first')
cv2.imshow('frame',img)
if (cv2.waitKey(1)==ord('q')):
break
ts = time.time()
date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d')
timeStamp = datetime.datetime.fromtimestamp(ts).strftime('%H:%M:%S')
Hour,Minute,Second=timeStamp.split(":")
fileName="Attendance\Attendance_"+date+"_"+Hour+"-"+Minute+"-"+Second+".csv"
attendance.to_csv(fileName,index=False)
cam.release()
cv2.destroyAllWindows()
#print(attendance)
res=attendance
message2.configure(text= res)
This is the error i got for this code:
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Users\IFEANYI\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\indexes\base.py", line 2646, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Id'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\IFEANYI\AppData\Local\Programs\Python\Python38-32\lib\tkinter\__init__.py", line 1883, in __call__
return self.func(*args)
File "C:\Users\IFEANYI\Documents\yr 4 semester 2\FR2\ifeanyi train.py", line 184, in TrackImages
aa=df.loc[df['Id'] == Id]['Name'].values
File "C:\Users\IFEANYI\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\IFEANYI\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in strong textpandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Id'
This is a snapshot of StudentDetails.csv
StudentDetails.csv

Get excel column into a variable

I want to move some xls data into json. I can't just use a ready solution, since this is a bit of a special case.
Here's the excel
Here's the code:
import pandas
xl = pandas.ExcelFile("./data/file.xlsx")
df = xl.parse("2")
x = df["XX"][0]
print(x)
# writing to file
text_file = open("json_files/Output.json", "w")
# text_file.write(json_str)
text_file.close()
Here's the error I'm getting:
Traceback (most recent call last):
File "C:\Users\aironsid\Documents\Capgemini\Excel_to_Json\venv\lib\site-packages\pandas\core\indexes\base.py", line 2646, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'XX'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "excelToJson.py", line 5, in <module>
x = df["XX"][0]
File "C:\Users\aironsid\Documents\Capgemini\Excel_to_Json\venv\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\aironsid\Documents\Capgemini\Excel_to_Json\venv\lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'XX'
It seems to not be able to find the column name.
I'm using this video as reference
import pandas
xl = pandas.ExcelFile("file.xlsx")
# df = xl.parse("Text")
# print(df.columns)
# # x = df["XX"][0]
# # print(x)
df = pandas.Dataframe(xl)
print(df.columns)
# if you can see the columns
print(df["XX"])
# if this is success
dictionary = {"XX": list(df["XX"])}
# writing to file
text_file = open("json_files/Output.json", "w")
# text_file.write(json_str)
text_file.close()
please try this
df = pd.Dataframe(xl)
print(df.columns)
# if you can see the columns
print(df["XX"])
# if this is success
dictionary = {"XX": list(df["XX"])}
As mentioned in comments, you need to translate the starting point of A1 to B7 in your case. This can be achieved with the "skiprows" parameter of pandas.ExcelFile.parse and the index_col parameter:
import pandas
xl = pandas.ExcelFile("path\to\your\file.xlsx")
df = xl.parse("YourSheetName",index_col=1,skiprows=7)
For more documentation/parameters see pandas docs

How to convert csv's to line protocol

How do I display a CSV file in line protocol format, like influxdb uses?
My CSV data as below...
time, avg_FreshOrders, p95_FreshOrders, FreshOrders
1593648000000,1479.08407079646,2589,226
1593475200000,2242.8617021276596,5622,188
1593734400000,1682.3375,2738,160
I am using Python to convert to line protocol as below
import pandas as pd
#convert csv's to line protocol
df_full = pd.read_csv("data/FreshOrders.csv")
df_full["measurement"] = ['FO' for t in range(len(df_full))]
lines = [str(df_full["measurement"][d])
+ ",type=FreshOrders"
+ " "
+ "avg_FreshOrders=" + str(df_full["avg_FreshOrders"][d]) + ","
+ "p95_FreshOrders=" + str(df_full["p95_FreshOrders"][d]) + ","
+ "FreshOrders=" + str(df_full["FreshOrders"][d])
+ " " + str(df_full["time"][d]) for d in range(len(df_full))]
#append lines to a text file with DDL & DML:
thefile = open('data/import.txt', 'a+')
for item in lines:
thefile.write("%s\n" % item)
While running this Python code i am getting below errors.
Traceback (most recent call last):
File "csv_to_line.py", line 6, in <module>
lines = [str(df_full["measurement"][d])
File "csv_to_line.py", line 6, in <listcomp>
lines = [str(df_full["measurement"][d])
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\series.py", line 871, in __getitem__
result = self.index.get_value(self, key)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\base.py", line 4405, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 997, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1004, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
Can someone please help me to resolve this issue?
If you have problem with the keyword, you can use index instead.
import pandas as pd
# convert csv's to line protocol
df_full = pd.read_csv("haha.csv")
df_full["measurement"] = ['FO' for t in range(len(df_full))]
lines = []
for idx in range(len(df_full)):
temp = str(df_full.iloc[idx, -1]) + ",type=FreshOrders" + " " + "avg_FreshOrders=" + str(df_full.iloc[idx, 1]) + ","\
+ "p95_FreshOrders=" + str(df_full.iloc[idx, 2]) + "," + "FreshOrders=" + str(df_full.iloc[idx, 3]) + " " + \
str(df_full.iloc[idx, 0])
lines.append(temp)
# append lines to a text file with DDL & DML:
thefile = open('data/import.txt', 'a+')
for item in lines:
thefile.write("%s\n" % item)

Getting KeyError using Pandas when accessing .csv files

For some reason pandas is throwing an error when looking through some .csv stock data I have. Here is the error:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3078, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "./python-for-finance-7.py", line 75, in
compile_data()
File "./python-for-finance-7.py", line 59, in compile_data
df.set_index('Date', inplace=True)
File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", >line 3909, in set_index
level = frame[col]._values
File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", >line 2688, in getitem
return self._getitem_column(key)
File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", >line 2695, in _getitem_column
return self._get_item_cache(key)
File "/usr/local/lib/python3.7/site-packages/pandas/core/generic.py", line 2489, in _get_item_cache
values = self._data.get(item)
File "/usr/local/lib/python3.7/site->packages/pandas/core/internals.py", line 4115, in get
loc = self.items.get_loc(item)
File "/usr/local/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3080, in get_loc> return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'
to this code:
import bs4 as bs
import datetime as dt
import os
import pandas as pd
import pandas_datareader.data as web
import pickle
import requests
def compile_data():
with open("sp500tickers.pickle","rb") as f:
tickers = pickle.load(f)
main_df = pd.DataFrame()
for count,ticker in enumerate(tickers):
df = pd.read_csv('stock_dfs/{}.csv'.format(ticker),
delimiter=',', encoding="utf-8-sig")
df.set_index('Date', inplace=True)
df.rename(columns = {'Adj Close':ticker}, inplace=True)
df.drop(['High','Low','Open','Close','Volume'], 1, inplace=True)
if main_df.empty:
main_df = df
else:
main_df = main_df.join(df, how='outer')
print(count)
print(main_df.head())
main_df.to_csv('sp500_joined_closes.csv')
compile_data()
The data in the CSV files is arranged like this:
Date High Low Open Close Volume Adj. Close
yyyy-mm-dd $$ $$ $$ $$ $$ $$
I tried changing the casing of Date (ie changing Date to date) but it just moves on to throw another
KeyError:"['High', 'Low', 'Open', 'Close', 'Volume'] not found in axis
Can someone please help??
It looks like you're using the wrong delineator. The file is white-space delineated, not comma delineated.
Try using a whitespace delineator:
df = pd.read_csv('stock_dfs/{}.csv'.format(ticker),
delimiter=r'\s+', encoding="utf-8-sig")
In my case, I didn't have any entries when setting the index, the data frame was empty.
It's worth checking
if len(df) > 0:
before setting the index

Categories