pandas data frame KeyError oop - python

The purpose of this script is to read a csv file.
The file contains forex data.
The file has 7 columns Date, Time, Open, High, Low, Close and Volume, and around 600k rows.
After scraping the date and time the script must will make some date time calculation like month and day.
Then some technical analysis using TA-LIB library.
Here is the code:
import pandas as pd
import talib
class Data:
def __init__(self):
self.df = pd.DataFrame()
self.names = ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']
self.open = self.df['Open'].astype(float)
self.high = self.df['High'].astype(float)
self.low = self.df['Low'].astype(float)
self.close = self.df['Close'].astype(float)
def file(self, file):
self.df = pd.read_csv(file, names=self.names,
parse_dates={'Release Date': ['Date', 'Time']})
return self.df
def date(self):
self.df['Release Date'] = pd.to_datetime(self.df['Release Date'])
def year(self):
self.df['year'] = pd.to_datetime(self.df['Release Date']).dt.year
def month(self):
self.df['year'] = pd.to_datetime(self.df['Release Date']).dt.month
def day(self):
self.df['year'] = pd.to_datetime(self.df['Release Date']).dt.day
def dema(self):
# DEMA - Double Exponential Moving Average
self.df['DEMA'] = talib.DEMA(self.close, timeperiod=30)
def ema(self):
# EMA - Exponential Moving Average
self.df['EMA'] = talib.EMA(self.close, timeperiod=30)
def HT_TRENDLINE(self):
# HT_TRENDLINE - Hilbert Transform - Instantaneous Trendline
self.df['HT_TRENDLINE '] = talib.HT_TRENDLINE(self.close)
def KAMA(self):
# KAMA - Kaufman Adaptive Moving Average
self.df['KAMA'] = talib.KAMA(self.close, timeperiod=30)
def ma(self):
# MA - Moving average
self.df['MA'] = talib.MA(self.close, timeperiod=30, matype=0)
def print(self):
return print(self.df.head())
x = Data()
x.file(r"D:\Projects\Project Forex\USDJPY.csv")
x.print()
Here is the error:
Traceback (most recent call last):
File "C:\Users\Sayed\miniconda3\lib\site-packages\pandas\core\indexes\base.py", line 2646, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Open'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/Sayed/PycharmProjects/project/Technical Analysis.py", line 55, in <module>
x = Data()
File "C:/Users/Sayed/PycharmProjects/project/Technical Analysis.py", line 9, in __init__
self.open = self.df['Open'].astype(float)
File "C:\Users\Sayed\miniconda3\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\Sayed\miniconda3\lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Open'

In the __init__ function you are initializing empty DataFrame without any columns. But 1 line after, you are trying to convert Open column of the DataFrame to float.
def __init__(self):
self.df = pd.DataFrame() # No columns
self.names = ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']
self.open = self.df['Open'].astype(float) # ERROR: 'Open' column does not exist
self.high = self.df['High'].astype(float)
self.low = self.df['Low'].astype(float)
self.close = self.df['Close'].astype(float)
Change you init function to this and it should work!
def __init__(self):
self.names = ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']
self.df = pd.DataFrame(columns=self.names) # Empty dataframe with columns
self.open = self.df['Open'].astype(float) # Now 'Open' column exists
self.high = self.df['High'].astype(float)
self.low = self.df['Low'].astype(float)
self.close = self.df['Close'].astype(float)

Related

KeyError: 0 when used in existing function, otherwise the code works fine

I want to do the following:
I have data in long format organized by dates
Sometimes, data is missing as it there is no record of it
I found a solution by interpolating missing data using reindex which works fine when used outside of function, but for some reason, doesn't work when used inside of a function
def sum_customer_portfolio(country, sold_to):
df = pd.merge(etl_customer_portfolio(), etl_week(), how="left", on=["Country", "GCAS"])
df = df.loc[df["Country"].isin(country)]
df = df.loc[df["Sold_to"].isin(sold_to)]
df_week = etl_week()
df_week = df_week.dropna(subset=["Sold_to"])
df_week = df_week[["Week_num", "Date_range"]]
df_week = df_week.drop_duplicates(subset=["Date_range"])
sum_df = pd.merge(df, df_week, how="outer", on=["Week_num", "Date_range"])
sum_df["Stat_unit_qty"] = sum_df["Stat_unit_qty"].fillna(0, axis=0)
sum_df[["Country", "Sold_to", "Customer"]] = sum_df[["Country", "Sold_to", "Customer"]].fillna(method="ffill",
axis=0)
sum_df = sum_df.fillna("DUMMY_NOT_USE").replace("DUMMY_NOT_USE", np.nan)
reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
reindex_subset = reindex_subset.dropna()
reindex_subset = reindex_subset.set_index("Week_num")
reindex_subset = (reindex_subset.groupby("GCAS").apply(
lambda x: x.reindex(list(range(reindex_subset.index.min(), reindex_subset.index.max() + 1)), fill_value=0))
.drop("GCAS", axis=1).
reset_index("GCAS").
fillna(0).
reset_index())
reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])
final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
current_date = datetime.now().strftime("%d%m%Y_%H%M%S")
# return sum_df.to_excel(f"CUSTOMER_PORTFOLIO-{current_date}.xlsx", sheet_name="GCAS_SUM", index=False)
return final_df
Code above keeps giving me the following error:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3361, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 76, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 103, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 135, in pandas._libs.index.IndexEngine._get_loc_duplicates
File "pandas\_libs\index_class_helper.pxi", line 51, in pandas._libs.index.Float64Engine._maybe_get_bool_indexer
File "pandas\_libs\index.pyx", line 161, in pandas._libs.index.IndexEngine._unpack_bool_indexer
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 167, in <module>
sum_customer_portfolio(country=["Croatia", "Slovenia"], sold_to=[2000829798, 2000558171]).to_excel(writer, index=False, sheet_name="GCAS_SUM")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 113, in sum_customer_portfolio
reindex_subset = (reindex_subset.groupby(["GCAS", "Sold_to"]).apply(
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1253, in apply
result = self._python_apply_general(f, self._selected_obj)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1287, in _python_apply_general
keys, values, mutated = self.grouper.apply(f, data, self.axis)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 783, in apply
result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 1328, in fast_apply
return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
File "pandas\_libs\reduction.pyx", line 369, in pandas._libs.reduction.apply_frame_axis0
File "pandas\_libs\reduction.pyx", line 428, in pandas._libs.reduction.BlockSlider.__init__
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\frame.py", line 3430, in __getitem__
indexer = convert_to_index_sliceable(self, key)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexing.py", line 2329, in convert_to_index_sliceable
return idx._convert_slice_indexer(key, kind="getitem")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\numeric.py", line 242, in _convert_slice_indexer
return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5686, in slice_indexer
start_slice, end_slice = self.slice_locs(start, end, step=step)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5894, in slice_locs
end_slice = self.get_slice_bound(end, "right")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5808, in get_slice_bound
raise err
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5802, in get_slice_bound
slc = self.get_loc(label)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3363, in get_loc
raise KeyError(key) from err
KeyError: 0
When loading the data directly from Excel (same data that produced by the function), for example, "CUSTOMER_PORTFOLIO-11082021_234057.xlsx" and running the following code:
sum_df = pd.read_excel("CUSTOMER_PORTFOLIO-11082021_234057.xlsx")
reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
reindex_subset = reindex_subset.dropna()
reindex_subset = reindex_subset.set_index("Week_num")
reindex_subset = (reindex_subset.groupby("GCAS").apply(
lambda x: x.reindex(list(range(reindex_subset.index.min(), reindex_subset.index.max() + 1)), fill_value=0))
.drop("GCAS", axis=1).
reset_index("GCAS").
fillna(0).
reset_index())
reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])
final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
The code gives me results that I want.
What am I missing? I tried searching for this on SO overflow, but no success as of yet. I have tried resetting index, but unfortunately, it didn't help.
UPDATE: Pasted the full error traceback. Moreover, as I said above, when I run the function without the part of the code that "reindexes" the data, the code works just fine. I have also tried and still no luck:
df_new = df.copy(deep=True)
df_week= df_week.copy(deep=True)
And when I run the "reindex" part of the code on a finished .xlsx, it works just fine, which is strange in itself.

Problems with CSV | Stock Price Manipulation

everyone! I'm going through this course and am having issues. The line I'm having problems with is
df[f'{ticker}_{i}d'] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
You can find this in the def process_data_for_labels(ticker): function. Can anyone tell me what's going on? I copied his code exactly and am getting the same error.
import bs4 as bs
import requests
import pickle
import datetime as dt
import os
import pandas as pd
import pandas_datareader. data as web
import time
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
from collections import Counter
style.use('dark_background')
def save_sp500_tickers():
resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
soup = bs.BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'class':'wikitable sortable'})
tickers = []
for row in table.findAll('tr')[1:]:
ticker = row.findAll('td')[0].text
tickers.append(ticker.rstrip())
with open("sp500tickers.pickle", "wb") as f:
pickle.dump(tickers, f)
print(tickers)
return tickers
#save_sp500_tickers()
def get_data_from_yahoo(reload_sp500=False):
if reload_sp500:
tickers = save_sp500_tickers()
else:
with open("sp500tickers.pickle", "rb") as f:
tickers = pickle.load(f)
if not os.path.exists('stock_dfs'):
os.makedirs('stock_dfs')
start = dt.datetime(2015, 1, 1)
end = dt.datetime(2020, 7, 1)
for ticker in tickers:
if not os.path.exists('stock_dfs/{ticker}.csv'):
if '.' in ticker:
ticker = ticker.replace('.', '-')
time.sleep(1)
print(ticker)
df = web.DataReader(ticker, 'yahoo', start, end)
df.to_csv('stock_dfs/{}.csv'.format(ticker))
else:
print(f'Already have {ticker}')
#get_data_from_yahoo()
def compile_data():
with open("sp500tickers.pickle","rb") as f:
tickers = pickle.load(f)
main_df = pd.DataFrame()
for count,ticker in enumerate(tickers):
if '.' in ticker:
ticker = ticker.replace('.', '-')
df = pd.read_csv(f'stock_dfs/{ticker}.csv')
df.set_index('Date', inplace=True)
df.rename(columns={'Adj Close':ticker}, inplace=True)
df.drop(['Open','High','Low','Close','Volume'],1,inplace=True)
if main_df.empty:
main_df = df
else:
main_df = main_df.join(df, how='outer')
if count % 10 == 0:
print(count)
print(main_df.head())
main_df.to_csv('sp500_joined_closes.csv')
#compile_data()
def visualize_data():
df = pd.read_csv('sp500_joined_closes.csv')
#df['AAPL'].plot()
#plt.show()
df_corr = df.corr()
print(df_corr.head())
data = df_corr.values
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
heatmap = ax.pcolor(data, cmap=plt.cm.RdYlGn)
fig.colorbar(heatmap)
ax.set_xticks(np.arange(data.shape[0]) + 0.5, minor=False)
ax.set_yticks(np.arange(data.shape[1]) + 0.5, minor=False)
ax.invert_yaxis()
ax.xaxis.tick_top()
column_labels = df_corr.columns
row_labels = df_corr.index
ax.set_xticklabels(column_labels)
ax.set_yticklabels(row_labels)
plt.xticks(rotation=90)
heatmap.set_clim(-1, 1)
plt.tight_layout()
plt.show()
#visualize_data()
# Machine Learning
def process_data_for_labels(ticker):
hm_days = 7
df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
tickers = df.columns.values.tolist()
df.fillna(0,inplace=True)
for i in range(1, hm_days+1):
df[f'{ticker}_{i}d'] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
df.fillna(0, inplace=True)
return tickers, df
def buy_sell_hold(*args):
cols = [c for c in args]
requirement = 0.2
for col in cols:
if col > requirement:
return 1
if col < -requirement:
return -1
return 0
def extract_featuresets(ticker):
tickers, df = process_data_for_labels(ticker)
end = [eval(f"df[f'{ticker}_{i}']") for i in range(1, 8)]
df[f'{ticker}_target'] = list(map(
buy_sell_hold,
[exec(f"df[f'{ticker}_{i}']") for i in range(1, 8)]
))
vals = df[f'{ticker}_target'].values.tolist()
str_vals = [str(i) for i in vals]
print('Data spread: ', Counter(str_vals))
df.fillna(0, inplace=True)
df = df.replace([np.inf, -np.inf], np.nan)
df.dropna(inplace=True)
df_vals = df[[ticker for ticker in tickers]].pct_change()
df_vals = df_vals.replace([np.inf, -np.inf], 0)
df_vals.fillna(0, inplace=True)
X = df_vals.values
y = df['{ticker}_target'].values
return X,y,df
extract_featuresets('APPL')
Error:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2646, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1618, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1626, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'APPL'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "test.py", line 176, in <module>
extract_featuresets('APPL')
File "test.py", line 152, in extract_featuresets
tickers, df = process_data_for_labels(ticker)
File "test.py", line 132, in process_data_for_labels
df[f'{ticker}_{i}d'] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1618, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1626, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'APPL'
You've identified correctly where the problem begins but you need to continue to follow the breadcrumbs.
The error says "KeyError: 'APPL'", where 'APPL' is one of the stock tickers and are column names / keys to your stock price dataframe 'df' (at least the program expects that). However in this instance, 'df' doesn't contain the key / header of 'APPL'. Maybe something went wrong when loading the data where 'pd.read_csv' is used to read the CSV file? Or maybe the file itself is missing data?
Try open a python terminal and simply load the CSV file, is it what you (or the program) would expect?
Keep digging!

Get excel column into a variable

I want to move some xls data into json. I can't just use a ready solution, since this is a bit of a special case.
Here's the excel
Here's the code:
import pandas
xl = pandas.ExcelFile("./data/file.xlsx")
df = xl.parse("2")
x = df["XX"][0]
print(x)
# writing to file
text_file = open("json_files/Output.json", "w")
# text_file.write(json_str)
text_file.close()
Here's the error I'm getting:
Traceback (most recent call last):
File "C:\Users\aironsid\Documents\Capgemini\Excel_to_Json\venv\lib\site-packages\pandas\core\indexes\base.py", line 2646, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'XX'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "excelToJson.py", line 5, in <module>
x = df["XX"][0]
File "C:\Users\aironsid\Documents\Capgemini\Excel_to_Json\venv\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\aironsid\Documents\Capgemini\Excel_to_Json\venv\lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'XX'
It seems to not be able to find the column name.
I'm using this video as reference
import pandas
xl = pandas.ExcelFile("file.xlsx")
# df = xl.parse("Text")
# print(df.columns)
# # x = df["XX"][0]
# # print(x)
df = pandas.Dataframe(xl)
print(df.columns)
# if you can see the columns
print(df["XX"])
# if this is success
dictionary = {"XX": list(df["XX"])}
# writing to file
text_file = open("json_files/Output.json", "w")
# text_file.write(json_str)
text_file.close()
please try this
df = pd.Dataframe(xl)
print(df.columns)
# if you can see the columns
print(df["XX"])
# if this is success
dictionary = {"XX": list(df["XX"])}
As mentioned in comments, you need to translate the starting point of A1 to B7 in your case. This can be achieved with the "skiprows" parameter of pandas.ExcelFile.parse and the index_col parameter:
import pandas
xl = pandas.ExcelFile("path\to\your\file.xlsx")
df = xl.parse("YourSheetName",index_col=1,skiprows=7)
For more documentation/parameters see pandas docs

Getting KeyError using Pandas when accessing .csv files

For some reason pandas is throwing an error when looking through some .csv stock data I have. Here is the error:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3078, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "./python-for-finance-7.py", line 75, in
compile_data()
File "./python-for-finance-7.py", line 59, in compile_data
df.set_index('Date', inplace=True)
File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", >line 3909, in set_index
level = frame[col]._values
File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", >line 2688, in getitem
return self._getitem_column(key)
File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", >line 2695, in _getitem_column
return self._get_item_cache(key)
File "/usr/local/lib/python3.7/site-packages/pandas/core/generic.py", line 2489, in _get_item_cache
values = self._data.get(item)
File "/usr/local/lib/python3.7/site->packages/pandas/core/internals.py", line 4115, in get
loc = self.items.get_loc(item)
File "/usr/local/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3080, in get_loc> return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'
to this code:
import bs4 as bs
import datetime as dt
import os
import pandas as pd
import pandas_datareader.data as web
import pickle
import requests
def compile_data():
with open("sp500tickers.pickle","rb") as f:
tickers = pickle.load(f)
main_df = pd.DataFrame()
for count,ticker in enumerate(tickers):
df = pd.read_csv('stock_dfs/{}.csv'.format(ticker),
delimiter=',', encoding="utf-8-sig")
df.set_index('Date', inplace=True)
df.rename(columns = {'Adj Close':ticker}, inplace=True)
df.drop(['High','Low','Open','Close','Volume'], 1, inplace=True)
if main_df.empty:
main_df = df
else:
main_df = main_df.join(df, how='outer')
print(count)
print(main_df.head())
main_df.to_csv('sp500_joined_closes.csv')
compile_data()
The data in the CSV files is arranged like this:
Date High Low Open Close Volume Adj. Close
yyyy-mm-dd $$ $$ $$ $$ $$ $$
I tried changing the casing of Date (ie changing Date to date) but it just moves on to throw another
KeyError:"['High', 'Low', 'Open', 'Close', 'Volume'] not found in axis
Can someone please help??
It looks like you're using the wrong delineator. The file is white-space delineated, not comma delineated.
Try using a whitespace delineator:
df = pd.read_csv('stock_dfs/{}.csv'.format(ticker),
delimiter=r'\s+', encoding="utf-8-sig")
In my case, I didn't have any entries when setting the index, the data frame was empty.
It's worth checking
if len(df) > 0:
before setting the index

How to resolve date time error in Pandas code?

I have a csv file that has 7 columns ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']
The thing is I tried to set a datetime index but it does not work may be because date and time are two separate columns.
Here is the code:
import pandas as pd
column_names = ['Date', 'Time', 'Open', 'High', 'Low','Close', 'Volume']
df = pd.read_csv(r"E:\Tutorial\EURUSD60.csv", header=None, names=column_names)
df['DateTime'] = pd.to_datetime(df['Date', 'Time'])
print(df.head())
Here is the error:
C:\Users\sydgo\Anaconda3\python.exe E:/Tutorial/language.py Traceback
(most recent call last): File
"C:\Users\sydgo\Anaconda3\lib\site-packages\pandas\core\indexes\base.py",
line 2442, in get_loc
return self._engine.get_loc(key) File "pandas_libs\index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc File
"pandas_libs\index.pyx", line 154, in
pandas._libs.index.IndexEngine.get_loc File
"pandas_libs\hashtable_class_helper.pxi", line 1210, in
pandas._libs.hashtable.PyObjectHashTable.get_item File
"pandas_libs\hashtable_class_helper.pxi", line 1218, in
pandas._libs.hashtable.PyObjectHashTable.get_item KeyError: ('Date',
'Time')
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "E:/Tutorial/language.py",
line 7, in
df['DateTime'] = pd.to_datetime(df['Date', 'Time']) File "C:\Users\sydgo\Anaconda3\lib\site-packages\pandas\core\frame.py",
line 1964, in getitem
return self._getitem_column(key) File "C:\Users\sydgo\Anaconda3\lib\site-packages\pandas\core\frame.py",
line 1971, in _getitem_column
return self._get_item_cache(key) File "C:\Users\sydgo\Anaconda3\lib\site-packages\pandas\core\generic.py",
line 1645, in _get_item_cache
values = self._data.get(item) File "C:\Users\sydgo\Anaconda3\lib\site-packages\pandas\core\internals.py",
line 3590, in get
loc = self.items.get_loc(item) File "C:\Users\sydgo\Anaconda3\lib\site-packages\pandas\core\indexes\base.py",
line 2444, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key)) File "pandas_libs\index.pyx", line 132, in
pandas._libs.index.IndexEngine.get_loc File
"pandas_libs\index.pyx", line 154, in
pandas._libs.index.IndexEngine.get_loc File
"pandas_libs\hashtable_class_helper.pxi", line 1210, in
pandas._libs.hashtable.PyObjectHashTable.get_item File
"pandas_libs\hashtable_class_helper.pxi", line 1218, in
pandas._libs.hashtable.PyObjectHashTable.get_item KeyError: ('Date',
'Time')
If you simplify your code, you'll see the error is right here:
df['Date', 'Time']
That's because you are indexing into the DataFrame once by two strings, but you want to index into it twice, by each of two strings. That is:
df[['Date', 'Time']]
Still, this may fail, because to_datetime expects strings, not pairs of strings:
pd.to_datetime(df['Date', 'Time'])
In which case try this:
pd.to_datetime(df.Date + ' ' + df.Time)

Categories