Handling OutOfBoundsDatetime error in pandas - python

I am trying to apply a few functions on a pandas data frame, but am getting an OutOfBoundsDatetime error -
def data_cleanser(data):
clean_df = data.str.strip().str.replace(r'\\', '')
return clean_df
connection = pyodbc.connect(conn)
sql = 'SELECT * FROM {}'.format(tablename)
df = pd.read_sql_query(sql, connection)
df.replace([None], np.nan, inplace=True)
df.fillna('', inplace=True)
df=df.applymap(str)
df = df.apply(lambda x: data_cleanser(x))
Error message:
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Python_Scripts/lib64/python3.4/site-packages/pandas/core/internals.py", line 763, in replace
copy=not inplace) for b in blocks]
File "/Python_Scripts/lib64/python3.4/site-packages/pandas/core/internals.py", line 763, in <listcomp>
copy=not inplace) for b in blocks]
File "/Python_Scripts/lib64/python3.4/site-packages/pandas/core/internals.py", line 2135, in convert
blocks = self.split_and_operate(None, f, False)
File "/Python_Scripts/lib64/python3.4/site-packages/pandas/core/internals.py", line 478, in split_and_operate
nv = f(m, v, i)
File "/Python_Scripts/lib64/python3.4/site-packages/pandas/core/internals.py", line 2125, in f
values = fn(v.ravel(), **fn_kwargs)
File "/Python_Scripts/lib64/python3.4/site-packages/pandas/core/dtypes/cast.py", line 807, in soft_convert_objects
values = lib.maybe_convert_objects(values, convert_datetime=datetime)
File "pandas/_libs/src/inference.pyx", line 1290, in pandas._libs.lib.maybe_convert_objects
File "pandas/_libs/tslib.pyx", line 1575, in pandas._libs.tslib.convert_to_tsobject
File "pandas/_libs/tslib.pyx", line 1669, in pandas._libs.tslib.convert_datetime_to_tsobject
File "pandas/_libs/tslib.pyx", line 1848, in pandas._libs.tslib._check_dts_bounds
pandas._libs.tslib.OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 1-01-01 07:00:00
Sample data frame:
appointment_id
start_time
end_time
emp_id
302205
2016-10-26 17:30:00
2016-10-26 18:30:00
45807
462501
2017-04-10 13:00:00
NaT
45807
How can I avoid this error?

Related

KeyError when using set_index on Dataframe created from read_sql_query

With the following code, I can see the following table output
query = f''' SELECT timestamp, base FROM pricing WHERE {timestamp_range} LIMIT {max_records}'''
df: DataFrame = read_sql_query(text(query), db)
print(df.head())
print(df.columns)
df.set_index('timestamp', inplace=True) # Error here
# Output
# timestamp base
# 0 2023-02-17 10:25:54.099542 21
# 1 2023-02-17 10:27:54.060627 21
# 2 2023-02-17 10:29:53.581384 22
# 3 2023-02-17 10:31:54.110646 20
# 4 2023-02-17 10:33:53.827830 20
# Index(['timestamp', 'base'], dtype='object')
So it looks like I do have the timestamp column, but when using set_index() I get a KeyError: 'timestamp'. Why is this? Using df.columns[0] didn't help either.
For reference, full stack trace log
[2023-02-17 13:50:32,388] ERROR in app: Exception on /api/data/query [GET]
Traceback (most recent call last):
File "/home/ubuntu/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3802, in get_loc
return self._engine.get_loc(casted_key)
File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'timestamp'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/ubuntu/.local/lib/python3.10/site-packages/flask/app.py", line 2525, in wsgi_app
response = self.full_dispatch_request()
File "/home/ubuntu/.local/lib/python3.10/site-packages/flask/app.py", line 1822, in full_dispatch_request
rv = self.handle_user_exception(e)
File "/home/ubuntu/.local/lib/python3.10/site-packages/flask/app.py", line 1820, in full_dispatch_request
rv = self.dispatch_request()
File "/home/ubuntu/.local/lib/python3.10/site-packages/flask/app.py", line 1796, in dispatch_request
return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
File "/home/ubuntu/Code/grozd/main.py", line 18, in query
result = getQuery(request.args)
File "/home/ubuntu/Code/grozd/data.py", line 27, in getQuery
df['timestamp'] = df['timestamp'].astype(int).floordiv(1000000).astype(int)
File "/home/ubuntu/.local/lib/python3.10/site-packages/pandas/core/frame.py", line 3807, in __getitem__
indexer = self.columns.get_loc(key)
File "/home/ubuntu/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3804, in get_loc
raise KeyError(key) from err
KeyError: 'timestamp'
From the traceback it looks like the error does not actually occur in the line you highlighted, but later on when you do this:
df['timestamp'] = df['timestamp'].astype(int).floordiv(1000000).astype(int)
The reason this fails is that once you've assigned the timestamp column to the index, it is no longer found among the columns. To fix this you can either perform your transformation first and then set the index, or transform the index directly:
df.index = df.index.astype(int).floordiv(1000000).astype(int)

Why does pandas market calendar give "ValueError: The dtype of 'values' is incorrect. Must be 'datetime64[ns]'. Got object instead."?

The pandas-market-calendars module documentation gives a quickstart example of how to create a schedule here: https://pypi.org/project/pandas-market-calendars/
I copy-pasted their quickstart example:
import pandas_market_calendars as mcal
# Create a calendar
nyse = mcal.get_calendar('NYSE')
# Show available calendars
print(mcal.get_calendar_names())
early = nyse.schedule(start_date='2012-07-01', end_date='2012-07-10')
print(early)
In their example, we should get a nice pretty schedule output:
market_open market_close
=========== ========================= =========================
2012-07-02 2012-07-02 13:30:00+00:00 2012-07-02 20:00:00+00:00
2012-07-03 2012-07-03 13:30:00+00:00 2012-07-03 17:00:00+00:00
2012-07-05 2012-07-05 13:30:00+00:00 2012-07-05 20:00:00+00:00
2012-07-06 2012-07-06 13:30:00+00:00 2012-07-06 20:00:00+00:00
2012-07-09 2012-07-09 13:30:00+00:00 2012-07-09 20:00:00+00:00
2012-07-10 2012-07-10 13:30:00+00:00 2012-07-10 20:00:00+00:00
instead, I get an error:
ValueError: The dtype of 'values' is incorrect. Must be 'datetime64[ns]'. Got object instead.
What am I doing wrong? I thought I input the dates exactly like the example showed. Why is it requiring a different format? Some people have run the above code without error. My traceback is:
Traceback (most recent call last):
File "D:/Strawberry/Strawberry_Omega/tests/pandas_mkt_clndr_test.py", line 9, in <module>
early = nyse.schedule(start_date='2012-07-01', end_date='2012-07-10')
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas_market_calendars\market_calendar.py", line 632, in schedule
adjusted = schedule.loc[_close_adj].apply(
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\frame.py", line 7547, in apply
return op.get_result()
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\apply.py", line 180, in get_result
return self.apply_standard()
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\apply.py", line 255, in apply_standard
results, res_index = self.apply_series_generator()
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\apply.py", line 284, in apply_series_generator
results[i] = self.f(v)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas_market_calendars\market_calendar.py", line 633, in <lambda>
lambda x: x.where(x.le(x["market_close"]), x["market_close"]), axis= 1)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\generic.py", line 9001, in where
return self._where(
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\generic.py", line 8741, in _where
cond, _ = cond.align(self, join="right", broadcast_axis=1)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\series.py", line 4274, in align
return super().align(
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\generic.py", line 8556, in align
return self._align_series(
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\generic.py", line 8664, in _align_series
right = other._reindex_indexer(join_index, ridx, copy)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\series.py", line 4241, in _reindex_indexer
return self.copy()
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\generic.py", line 5660, in copy
data = self._mgr.copy(deep=deep)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\internals\managers.py", line 802, in copy
res = self.apply("copy", deep=deep)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\internals\managers.py", line 406, in apply
applied = getattr(b, f)(**kwargs)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\internals\blocks.py", line 679, in copy
return self.make_block_same_class(values, ndim=self.ndim)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\internals\blocks.py", line 261, in make_block_same_class
return type(self)(values, placement=placement, ndim=ndim)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\internals\blocks.py", line 1544, in __init__
values = self._maybe_coerce_values(values)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\internals\blocks.py", line 2170, in _maybe_coerce_values
values = self._holder(values)
File "C:\Users\Wes\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\arrays\datetimes.py", line 254, in __init__
raise ValueError(
ValueError: The dtype of 'values' is incorrect. Must be 'datetime64[ns]'. Got object instead.

Python Pandas convert date to epoch timestamp

From a CSV file, I'm trying with pandas to convert a date column to an epoch timestamp as follows, but I got some errors:
csv:
<<Electric power and temperature Information>>
Date,Electric power average,Electric power maximum value,Electric power minimum value,...,...
2021/12/02 00:00:00,1524,1553,1506,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
2021/12/01 22:00:00,1521,1547,1468,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
2021/12/01 20:00:00,1546,1613,1524,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
2021/12/01 18:00:00,1553,1595,1525,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
2021/12/01 16:00:00,1541,1593,1520,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
2021/12/01 14:00:00,1540,1580,1514,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
code:
csv_envfile = csvfile.csv
df = pd.read_csv(csv_envfile[0], skiprows=[0])
date_pattern='%Y/%m/%d %H:%M:%S '
df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=0) # create epoch as a column
print("epoch:",df['epoch'])
error:
Traceback (most recent call last):
File "./02-pickle-client.py", line 622, in <module>
main()
File "./02-pickle-client.py", line 576, in main
execute_run_csv_environnement(confcsv_path, storage_type, serial)
File "./02-pickle-client.py", line 434, in execute_run_csv_environnement
run_csv_environnement(sock, delay, csvfile, storage_type, serial)
File "./02-pickle-client.py", line 402, in run_csv_environnement
df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=0) # create epoch as a column
File "/usr/local/lib64/python3.6/site-packages/pandas/core/frame.py", line 7552, in apply
return op.get_result()
File "/usr/local/lib64/python3.6/site-packages/pandas/core/apply.py", line 185, in get_result
return self.apply_standard()
File "/usr/local/lib64/python3.6/site-packages/pandas/core/apply.py", line 276, in apply_standard
results, res_index = self.apply_series_generator()
File "/usr/local/lib64/python3.6/site-packages/pandas/core/apply.py", line 305, in apply_series_generator
results[i] = self.f(v)
File "./02-pickle-client.py", line 402, in <lambda>
df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=0) # create epoch as a column
File "/usr/local/lib64/python3.6/site-packages/pandas/core/generic.py", line 5141, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'time'
Many thanks for the help.
You should select the Date column when applying the lambda function. In your case this should work:
import pandas as pd
import time
csv_envfile = csvfile.csv
df = pd.read_csv(csv_envfile[0], skiprows=[0])
date_pattern='%Y/%m/%d %H:%M:%S'
df['epoch'] = df["Date"].apply(lambda row: int(time.mktime(time.strptime(row,date_pattern))))

KeyError: 0 when used in existing function, otherwise the code works fine

I want to do the following:
I have data in long format organized by dates
Sometimes, data is missing as it there is no record of it
I found a solution by interpolating missing data using reindex which works fine when used outside of function, but for some reason, doesn't work when used inside of a function
def sum_customer_portfolio(country, sold_to):
df = pd.merge(etl_customer_portfolio(), etl_week(), how="left", on=["Country", "GCAS"])
df = df.loc[df["Country"].isin(country)]
df = df.loc[df["Sold_to"].isin(sold_to)]
df_week = etl_week()
df_week = df_week.dropna(subset=["Sold_to"])
df_week = df_week[["Week_num", "Date_range"]]
df_week = df_week.drop_duplicates(subset=["Date_range"])
sum_df = pd.merge(df, df_week, how="outer", on=["Week_num", "Date_range"])
sum_df["Stat_unit_qty"] = sum_df["Stat_unit_qty"].fillna(0, axis=0)
sum_df[["Country", "Sold_to", "Customer"]] = sum_df[["Country", "Sold_to", "Customer"]].fillna(method="ffill",
axis=0)
sum_df = sum_df.fillna("DUMMY_NOT_USE").replace("DUMMY_NOT_USE", np.nan)
reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
reindex_subset = reindex_subset.dropna()
reindex_subset = reindex_subset.set_index("Week_num")
reindex_subset = (reindex_subset.groupby("GCAS").apply(
lambda x: x.reindex(list(range(reindex_subset.index.min(), reindex_subset.index.max() + 1)), fill_value=0))
.drop("GCAS", axis=1).
reset_index("GCAS").
fillna(0).
reset_index())
reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])
final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
current_date = datetime.now().strftime("%d%m%Y_%H%M%S")
# return sum_df.to_excel(f"CUSTOMER_PORTFOLIO-{current_date}.xlsx", sheet_name="GCAS_SUM", index=False)
return final_df
Code above keeps giving me the following error:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3361, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 76, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 103, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 135, in pandas._libs.index.IndexEngine._get_loc_duplicates
File "pandas\_libs\index_class_helper.pxi", line 51, in pandas._libs.index.Float64Engine._maybe_get_bool_indexer
File "pandas\_libs\index.pyx", line 161, in pandas._libs.index.IndexEngine._unpack_bool_indexer
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 167, in <module>
sum_customer_portfolio(country=["Croatia", "Slovenia"], sold_to=[2000829798, 2000558171]).to_excel(writer, index=False, sheet_name="GCAS_SUM")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 113, in sum_customer_portfolio
reindex_subset = (reindex_subset.groupby(["GCAS", "Sold_to"]).apply(
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1253, in apply
result = self._python_apply_general(f, self._selected_obj)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1287, in _python_apply_general
keys, values, mutated = self.grouper.apply(f, data, self.axis)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 783, in apply
result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 1328, in fast_apply
return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
File "pandas\_libs\reduction.pyx", line 369, in pandas._libs.reduction.apply_frame_axis0
File "pandas\_libs\reduction.pyx", line 428, in pandas._libs.reduction.BlockSlider.__init__
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\frame.py", line 3430, in __getitem__
indexer = convert_to_index_sliceable(self, key)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexing.py", line 2329, in convert_to_index_sliceable
return idx._convert_slice_indexer(key, kind="getitem")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\numeric.py", line 242, in _convert_slice_indexer
return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5686, in slice_indexer
start_slice, end_slice = self.slice_locs(start, end, step=step)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5894, in slice_locs
end_slice = self.get_slice_bound(end, "right")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5808, in get_slice_bound
raise err
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5802, in get_slice_bound
slc = self.get_loc(label)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3363, in get_loc
raise KeyError(key) from err
KeyError: 0
When loading the data directly from Excel (same data that produced by the function), for example, "CUSTOMER_PORTFOLIO-11082021_234057.xlsx" and running the following code:
sum_df = pd.read_excel("CUSTOMER_PORTFOLIO-11082021_234057.xlsx")
reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
reindex_subset = reindex_subset.dropna()
reindex_subset = reindex_subset.set_index("Week_num")
reindex_subset = (reindex_subset.groupby("GCAS").apply(
lambda x: x.reindex(list(range(reindex_subset.index.min(), reindex_subset.index.max() + 1)), fill_value=0))
.drop("GCAS", axis=1).
reset_index("GCAS").
fillna(0).
reset_index())
reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])
final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
The code gives me results that I want.
What am I missing? I tried searching for this on Stack Overflow, but no success as of yet. I have tried resetting the index, but unfortunately, it didn't help.
UPDATE: Pasted the full error traceback. Moreover, as I said above, when I run the function without the part of the code that "reindexes" the data, the code works just fine. I have also tried and still no luck:
df_new = df.copy(deep=True)
df_week= df_week.copy(deep=True)
And when I run the "reindex" part of the code on a finished .xlsx, it works just fine, which is strange in itself.

Apply a function from a groupby transform

My pandas looks like this
Date Ticker Open High Low Adj Close Adj_Close Volume
2016-04-18 vws.co 445.0 449.2 441.7 447.3 447.3 945300
2016-04-19 vws.co 449.0 455.8 448.3 450.9 450.9 907700
2016-04-20 vws.co 451.0 452.5 435.4 436.6 436.6 1268100
2016-04-21 vws.co 440.1 442.9 428.4 435.5 435.5 1308300
2016-04-22 vws.co 435.5 435.5 435.5 435.5 435.5 0
2016-04-25 vws.co 431.0 436.7 424.4 430.0 430.0 1311700
2016-04-18 nflx 109.9 110.7 106.02 108.4 108.4 27001500
2016-04-19 nflx 99.49 101.37 94.2 94.34 94.34 55623900
2016-04-20 nflx 94.34 96.98 93.14 96.77 96.77 25633600
2016-04-21 nflx 97.31 97.38 94.78 94.98 94.98 19859400
2016-04-22 nflx 94.85 96.69 94.21 95.9 95.9 15786000
2016-04-25 nflx 95.7 95.75 92.8 93.56 93.56 14965500
I have a program that, in one of the functions with embedded functions, successfully runs a groupby.
This line looks like this
df['MA3'] = df.groupby('Ticker').Adj_Close.transform(lambda group: pd.rolling_mean(group, window=3))
Se my initial question and the data-format here:
Select only one value in df col rows in same df for calc results from different val, and calc df only on one ticker at a time
It has now dawned on me that rather than doing the groupby in each embedded function of which I have 5, I would rather have the groupby run in the main program calling the top function, so all the embedded functions could work on the filtered groupby pandas dataframe from only doing the groupby once...
How do I apply my main function with groupby, in order to filter my pandas, so I only work on one ticker (value in col 'Ticker') at a time?
The 'Ticker' col contains 'aapl', 'msft', 'nflx' company identifiers etc., with timeseries data for a time-window.
Thanks a lot, Karasinski. This is close to what I want. But I get an error.
When I run:
def Screener(df_all, group):
# Copy df_all to df for single ticker operations
df = df_all.copy()
def diff_calc(df,ticker):
df['Difference'] = df['Adj_Close'].diff()
return df
df = diff_calc(df, ticker)
return df_all
for ticker in stocklist:
df_all[['Difference']] = df_all.groupby('Ticker').Adj_Close.apply(Screener, ticker)
I get this error:
Traceback (most recent call last):
File "<ipython-input-2-d7c1835f6b2a>", line 1, in <module>
runfile('C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox/screener_test simple for StockOverflowNestedFct_Getstock.py', wdir='C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox')
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 682, in runfile
execfile(filename, namespace)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 85, in execfile
exec(compile(open(filename, 'rb').read(), filename, 'exec'), namespace)
File "C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox/screener_test simple for StockOverflowNestedFct_Getstock.py", line 144, in <module>
df_all[['Difference']] = df_all.groupby('Ticker').Adj_Close.apply(Screener, ticker)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 663, in apply
return self._python_apply_general(f)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 667, in _python_apply_general
self.axis)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 1286, in apply
res = f(group)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 659, in f
return func(g, *args, **kwargs)
File "C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox/screener_test simple for StockOverflowNestedFct_Getstock.py", line 112, in Screener
df = diff_calc(df, ticker)
File "C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox/screener_test simple for StockOverflowNestedFct_Getstock.py", line 70, in diff_calc
df['Difference'] = df['Adj_Close'].diff()
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\series.py", line 514, in __getitem__
result = self.index.get_value(self, key)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\tseries\index.py", line 1221, in get_value
raise KeyError(key)
KeyError: 'Adj_Close'
And when I use functools like so
df_all = functools.partial(df_all.groupby('Ticker').Adj_Close.apply(Screener, ticker))
I get the same error as above...
Traceback (most recent call last):
File "<ipython-input-5-d7c1835f6b2a>", line 1, in <module>
runfile('C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox/screener_test simple for StockOverflowNestedFct_Getstock.py', wdir='C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox')
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 682, in runfile
execfile(filename, namespace)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 85, in execfile
exec(compile(open(filename, 'rb').read(), filename, 'exec'), namespace)
File "C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox/screener_test simple for StockOverflowNestedFct_Getstock.py", line 148, in <module>
df_all = functools.partial(df_all.groupby('Ticker').Adj_Close.apply(Screener, [ticker]))
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 663, in apply
return self._python_apply_general(f)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 667, in _python_apply_general
self.axis)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 1286, in apply
res = f(group)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 659, in f
return func(g, *args, **kwargs)
File "C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox/screener_test simple for StockOverflowNestedFct_Getstock.py", line 114, in Screener
df = diff_calc(df, ticker)
File "C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox/screener_test simple for StockOverflowNestedFct_Getstock.py", line 72, in diff_calc
df['Difference'] = df['Adj_Close'].diff()
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\series.py", line 514, in __getitem__
result = self.index.get_value(self, key)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-
3.3.5.amd64\lib\site-packages\pandas\tseries\index.py", line 1221, in get_value
raise KeyError(key)
KeyError: 'Adj_Close'
Edit from Karasinski's edit from 31/5.
When I run the last suggestion from Karasinski I get this error.
mmm
mmm
nflx
vws.co
Traceback (most recent call last):
File "<ipython-input-4-d7c1835f6b2a>", line 1, in <module>
runfile('C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox/screener_test simple for StockOverflowNestedFct_Getstock.py', wdir='C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox')
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 682, in runfile
execfile(filename, namespace)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 85, in execfile
exec(compile(open(filename, 'rb').read(), filename, 'exec'), namespace)
File "C:/Users/Morten/Documents/Design/Python/CrystalBall - Local - Git/Git - CrystalBall/sandbox/screener_test simple for StockOverflowNestedFct_Getstock.py", line 173, in <module>
df_all[['mean', 'max', 'median', 'min']] = df_all.groupby('Ticker').apply(group_func)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 663, in apply
return self._python_apply_general(f)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 670, in _python_apply_general
not_indexed_same=mutated)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 2785, in _wrap_applied_output
not_indexed_same=not_indexed_same)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\groupby.py", line 1142, in _concat_objects
result = result.reindex_axis(ax, axis=self.axis)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\frame.py", line 2508, in reindex_axis
fill_value=fill_value)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\generic.py", line 1841, in reindex_axis
{axis: [new_index, indexer]}, fill_value=fill_value, copy=copy)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\generic.py", line 1865, in _reindex_with_indexers
copy=copy)
File "C:\Program Files\WinPython-64bit-3.3.5.7\python-3.3.5.amd64\lib\site-packages\pandas\core\internals.py", line 3144, in reindex_indexer
raise ValueError("cannot reindex from a duplicate axis")
ValueError: cannot reindex from a duplicate axis
From an answer from your previous question we can set up with
import pandas as pd
from StringIO import StringIO
text = """Date Ticker Open High Low Adj_Close Volume
2015-04-09 vws.co 315.000000 316.100000 312.500000 311.520000 1686800
2015-04-10 vws.co 317.000000 319.700000 316.400000 312.700000 1396500
2015-04-13 vws.co 317.900000 321.500000 315.200000 315.850000 1564500
2015-04-14 vws.co 320.000000 322.400000 318.700000 314.870000 1370600
2015-04-15 vws.co 320.000000 321.500000 319.200000 316.150000 945000
2015-04-16 vws.co 319.000000 320.200000 310.400000 307.870000 2236100
2015-04-17 vws.co 309.900000 310.000000 302.500000 299.100000 2711900
2015-04-20 vws.co 303.000000 312.000000 303.000000 306.490000 1629700
2016-03-31 mmm 166.750000 167.500000 166.500000 166.630005 1762800
2016-04-01 mmm 165.630005 167.740005 164.789993 167.529999 1993700
2016-04-04 mmm 167.110001 167.490005 165.919998 166.399994 2022800
2016-04-05 mmm 165.179993 166.550003 164.649994 165.809998 1610300
2016-04-06 mmm 165.339996 167.080002 164.839996 166.809998 2092200
2016-04-07 mmm 165.880005 167.229996 165.250000 167.160004 2721900"""
df = pd.read_csv(StringIO(text), delim_whitespace=1, parse_dates=[0], index_col=0)
You can then make a function which calculates whatever statistics you'd like, such as:
def various_indicators(group):
mean = pd.rolling_mean(group, window=3)
max = pd.rolling_max(group, window=3)
median = pd.rolling_median(group, window=3)
min = pd.rolling_min(group, window=3)
return pd.DataFrame({'mean': mean,
'max': max,
'median': median,
'min': min})
To assign these new columns to your dataframe, you would then do a groupby and then apply the function by
df[['mean', 'max', 'median', 'min']] = df.groupby('Ticker').Adj_Close.apply(various_indicators)
EDIT
In regards to your further questions in the comments of the answer: To extract additional information from the dataframe, you should instead pass the entire group rather than just the single column.
def group_func(group):
ticker = group.Ticker.unique()[0]
adj_close = group.Adj_Close
return Screener(ticker, adj_close)
def Screener(ticker, adj_close):
print(ticker)
mean = pd.rolling_mean(adj_close, window=3)
max = pd.rolling_max(adj_close, window=3)
median = pd.rolling_median(adj_close, window=3)
min = pd.rolling_min(adj_close, window=3)
return pd.DataFrame({'mean': mean,
'max': max,
'median': median,
'min': min})
You can then assign these columns in a similar way as above
df[['mean', 'max', 'median', 'min']] = df.groupby('Ticker').apply(group_func)

Categories