Calculating ADX indicator with pandas DF / Smoothing average issues - python

I've been having issues calculating ADX indicator with pandas DF in Python. Been racking my brains the past few days regarding what is going wrong. My thinking is something to do with smoothing average maybe? the last couple of lines from the results are: The ADX for the last row '2021-07-03' should be around 33.
i. date. open. high. low. close. +DI. -DM EMa. -DI. DX. ADX.
396 2021-06-30 35894.90 36088.87 34013.34 35036.58 ... 0.629251 -132.284380 -4.690877 1.309852 5.418229
397 2021-07-01 35048.78 35048.78 32720.03 33506.98 ... -4.476247 57.794871 2.098175 2.764602 5.479026
398 2021-07-02 33502.26 33971.12 32698.40 33796.99 ... -9.798669 52.972888 2.071717 1.536231 5.483165
399 2021-07-03 33831.62 34807.59 33322.38 34632.63 ... -4.374875 -37.287497 -1.544599 0.478130 5.475540
My code below:
def adx_calc(df):
for current in range(1, len(df.index)):
previous = current - 1
#True Range:
tr = max((df.loc[current,'high'] - df.loc[current,'low']), (df.loc[current,'high'] - df.loc[previous,'close']), abs((df.loc[previous, 'close'] - df.loc[current,'low'])))
df.loc[current, 'TR'] = tr
df['ATR'] = df['TR'].ewm(span = 14).mean()
# DM's:
df.loc[current, '+DM'] = df.loc[current, 'high'] - df.loc[previous, 'high']
df.loc[current, '-DM'] = df.loc[previous, 'low'] - df.loc[current, 'low']
# + DI:
df['+DM_EMA'] = pd.DataFrame.ewm(df['+DM'], span = adx_time_period).mean()
df.loc[current, '+DI'] = ((df.loc[current, '+DM_EMA'] / df.loc[current, 'ATR']) * 100)
# - DI:
df['-DM_EMA'] = pd.DataFrame.ewm(df['-DM'], span = adx_time_period).mean()
df.loc[current, '-DI'] = ((df.loc[current, '-DM_EMA'] / df.loc[current, 'ATR']) * 100)
# # DX:
df.loc[current, 'DX'] = (abs((df.loc[current, '+DI'] - df.loc[current, '-DI'])) / abs((df.loc[current, '+DI'] + df.loc[current, '-DI'])))
# # ADX:
df['ADX'] = df['DX'].rolling(window=14).mean()```

Related

Python - While trying to calculate RSI(Relative strength index - stock indicator) my results are "upside down" and shifted

I am trying to calculate RSI using simple functions.
The general formula for it is:
RSI = 100/(1+RS), where RS = Exponential Moving Average of gains / -||- of losses.
Here is what I am getting:
enter image description here
Here it is how should it look like:
enter image description here
I have everything double checked or even triple checked, but I can't find any mistake.
Thus I need your help, I know that the question is very simple though I need some help, I have no idea where I have made the mistake.
The general idea of RSI is that it should be low where the price is "low" and high, where the price is high, and generally no matter what I try I have it upside down.
def EMA(close_price_arr, n):
a = (2/n + 1)
EMA_n = np.empty((1, len(close_price_arr)))
for i in range(len(close_price_arr)):
if i < n:
# creating NaN values where it is impossible to calculate EMA to drop it later after connecting the whole database
EMA_n[0, i] = 'NaN'
if i >= n:
# Calaculating nominator and denominator of EMA
for j in range(n):
nominator_ema += close_price_arr[i - j] * a**(j)
denominator_ema += a**(j)
EMA_n[0, i] = nominator_ema / denominator_ema
nominator_ema = 0
denominator_ema = 0
return EMA_n
def gains(close_price_arr):
gain_arr = np.empty((len(close_price_arr) - 1))
for i in range(len(close_price_arr)):
if i == 0:
pass
if i >= 1:
if close_price_arr[i] > close_price_arr[i - 1]:
gain_arr[i - 1] = (close_price_arr[i] - close_price_arr[i-1])
else:
gain_arr[i - 1] = 0
return gain_arr
def losses(close_price_arr):
loss_arr = np.empty((len(close_price_arr) - 1))
for i in range(len(close_price_arr)):
if i == 0:
pass
if i >= 1:
if close_price_arr[i] < close_price_arr[i - 1]:
loss_arr[i - 1] = abs(close_price_arr[i] - close_price_arr[i - 1])
else:
loss_arr[i - 1] = 0
return loss_arr
def RSI(gain_arr, loss_arr, n):
EMA_u = EMA(gain_arr, n)
EMA_d = EMA(loss_arr, n)
EMA_diff = EMA_u / EMA_d
x,y = EMA_diff.shape
print(x, y)
RSI_n = np.empty((1, y))
for i in range(y):
if EMA_diff[0, i] == 'NaN':
RSI_n[0, i] = 'NaN'
print(i)
else:
RSI_n[0, i] = 100 / (1 + EMA_diff[0, i])
return RSI_n
#contextmanager
def show_complete_array():
oldoptions = np.get_printoptions()
np.set_printoptions(threshold=np.inf)
try:
yield
finally:
np.set_printoptions(**oldoptions)
np.set_printoptions(linewidth=3000)
pd.set_option('display.max_columns', None)
# Specyfying root folder, file folder and file
FILE = 'TVC_SILVER, 5.csv'
FOLDER = 'src'
PROJECT_ROOT_DIR = '.'
csv_path = os.path.join(PROJECT_ROOT_DIR, FOLDER, FILE)
# reading csv
price_data = pd.read_csv(csv_path, delimiter=',')
price_data_copy = price_data.copy()
price_data_nodate = price_data.copy().drop('time', axis=1)
price_data_np = price_data_nodate.to_numpy(dtype='float32')
close_price = price_data_np[:, 3]
EMA15 = EMA(close_price_arr=close_price, n=15)
EMA55 = EMA(close_price_arr=close_price, n=55)
gain = gains(close_price_arr=close_price)
loss = losses(close_price_arr=close_price)
RSI14 = RSI(gain_arr=gain, loss_arr=loss, n=14)
Try this:
"""dataset is a dataframe"""
def RSI(dataset, n=14):
delta = dataset.diff()
dUp, dDown = delta.copy(), delta.copy()
dUp[dUp < 0] = 0
dDown[dDown > 0] = 0
RolUp = pd.Series(dUp).rolling(window=n).mean()
RolDown = pd.Series(dDown).rolling(window=n).mean().abs()
RS = RolUp / RolDown
rsi= 100.0 - (100.0 / (1.0 + RS))
return rsi

List index out of range error when using Pandas and Yahoo_fin

This is a modified version of a program from a tutorial that extracts data from all of the stocks in the S&P 500 and picks stocks that match the criteria you specify.
The issue is that when I run the program List index out of range [stock symbol] pops up and those stocks are skipped and aren't added to the final CSV file.
Example:
list index out of range for ABMD
list index out of range for ABT
list index out of range for ADBE
list index out of range for ADI
I'm not really sure what the issue is, I would greatly appreciate it if someone would explain it to me! Also, I am not applying any of the specifying criteria yet and am just trying to get all of the stock data into the CSV file. Make sure to create a database named stock_data if you try the program. Thanks!
My code:
import pandas_datareader as web
import pandas as pd
from yahoo_fin import stock_info as si
import datetime as dt
dow_list = si.tickers_dow()
sp_list = si.tickers_sp500()
tickers = sp_list
'''tickers = list(set(tickers))
tickers.sort()'''
start = dt.datetime.now() - dt.timedelta(days=365)
end = dt.datetime.now()
sp500_df = web.DataReader('^GSPC', 'yahoo', start, end)
sp500_df['Pct Change'] = sp500_df['Adj Close'].pct_change()
sp500_return = (sp500_df['Pct Change'] + 1).cumprod()[-1]
return_list = []
final_df = pd.DataFrame(columns=['Ticker', 'Latest_Price', 'Score', 'PE_Ratio', 'PEG_Ratio', 'SMA_150', 'SMA_200', '52_Week_Low', '52_Week_High'])
counter = 0
for ticker in tickers:
df = web.DataReader(ticker, 'yahoo', start, end)
df.to_csv(f'stock_data/{ticker}.csv')
df['Pct Change'] = df['Adj Close'].pct_change()
stock_return = (df['Pct Change'] + 1).cumprod()[-1]
returns_compared = round((stock_return / sp500_return), 2)
return_list.append(returns_compared)
counter += 1
if counter == 100:
break
best_performers = pd.DataFrame(list(zip(tickers, return_list)), columns=['Ticker', 'Returns Compared'])
best_performers['Score'] = best_performers['Returns Compared'].rank(pct=True) * 100
best_performers = best_performers[best_performers['Score'] >= best_performers['Score'].quantile(0)] #picks stocks in top 25 percentile
for ticker in best_performers['Ticker']:
try:
df = pd.read_csv(f'stock_data/{ticker}.csv', index_col=0)
moving_averages = [150, 200]
for ma in moving_averages:
df['SMA_' + str(ma)] = round(df['Adj Close'].rolling(window=ma).mean(), 2)
latest_price = df['Adj Close'][-1]
pe_ratio = float(si.get_quote_table(ticker)['PE Ratio (TTM)'])
peg_ratio = float(si.get_stats_valuation(ticker)[1][4])
moving_average_150 = df['SMA_150'][-1]
moving_average_200 = df['SMA_200'][-1]
low_52week = round(min(df['Low'][-(52*5):]), 2)
high_52week = round(min(df['High'][-(52 * 5):]), 2)
score = round(best_performers[best_performers['Ticker'] == ticker]['Score'].tolist()[0])
condition_1 = latest_price > moving_average_150 > moving_average_200
condition_2 = latest_price >= (1.3 * low_52week)
condition_3 = latest_price >= (0.75 * high_52week)
condition_4 = pe_ratio < 25
condition_5 = peg_ratio < 2
final_df = final_df.append({'Ticker': ticker,
'Latest_Price': latest_price,
'Score': score,
'PE_Ratio': pe_ratio,
'PEG_Ratio': peg_ratio,
'SMA_150': moving_average_150,
'SMA_200': moving_average_200,
'52_Week_Low': low_52week,
'52_Week_High': high_52week}, ignore_index=True)
except Exception as e:
print(f"{e} for {ticker}")
final_df.sort_values(by='Score', ascending=False)
pd.set_option('display.max_columns', 10)
print(final_df)
final_df.to_csv('final.csv')
I have done the error shooting on your behalf. As a conclusion, I see that you have not checked the contents of the acquisition of the individual indicator data.
They are being added to the dictionary format and empty data frames as they are in index and named series. I believe that is the root cause of the error.
Specifying the last data and retrieving the values
iloc is not used.
52*5 lookbacks for 253 data
In addition, when additional indicators are acquired for the acquired issue data, there are cases where they can be acquired for the same issue, and cases where they cannot. (The cause is unknown.) Therefore, it may be necessary to change the method of processing pe_ratio and peg_ratio after obtaining them in advance.
for ticker in best_performers['Ticker']:
#print(ticker)
try:
df = pd.read_csv(f'stock_data/{ticker}.csv')#, index_col=0
moving_averages = [150, 200]
for ma in moving_averages:
df['SMA_' + str(ma)] = round(df['Adj Close'].rolling(window=ma).mean(), 2)
latest_price = df['Adj Close'][-1:].values[0]
pe_ratio = float(si.get_quote_table(ticker)['PE Ratio (TTM)'])
moving_average_150 = df['SMA_150'][-1:].values[0]
moving_average_200 = df['SMA_200'][-1:].values[0]
low_52week = round(min(df['Low'][-(52*1):]), 2)
high_52week = round(min(df['High'][-(52*1):]), 2)
#print(low_52week, high_52week)
score = round(best_performers[best_performers['Ticker'] == ticker]['Score'].tolist()[0])
#print(score)
#print(ticker, latest_price,score,pe_ratio,moving_average_200,low_52week,high_52week)
final_df = final_df.append({'Ticker': ticker,
'Latest_Price': latest_price,
'Score': score,
'PE_Ratio': pe_ratio,
'SMA_150': moving_average_150,
'SMA_200': moving_average_200,
'52_Week_Low': low_52week,
'52_Week_High': high_52week}, ignore_index=True)
#print(final_df)
except Exception as e:
print(f"{e} for {ticker}")
final_df
Ticker Latest_Price Score PE_Ratio SMA_150 SMA_200 52_Week_Low 52_Week_High
0 A 123.839996 40 31.42 147.26 150.31 123.06 126.75
1 AAP 218.250000 70 22.23 220.66 216.64 190.79 202.04
2 AAPL 165.070007 80 29.42 161.85 158.24 150.10 154.12
3 ABC 161.899994 90 21.91 132.94 129.33 132.00 137.79
4 ADBE 425.470001 10 42.46 552.19 571.99 407.94 422.38
Note
Some stocks are missing because additional indicators could not be obtained.
(tickers = sp_list[:10] tested on the first 10)

Filtering and saving subset of pandas

I have a function that does the following:
Inserting class values 1,2,3 based on timestamps. This work as inspected and in the first iteration of the first for-loop i get the following class distribution:
mapping: {'Seizure': 1, 'Preictal': 2, 'Interictal': 3}
value counts:
3.0 3150000
2.0 450000
1.0 28000
Name: class, dtype:
So i have this number of rows for each class.
However in the second forloop i iterate through the same list of timestamps and want to subset the data between the timestamps and include some conditions based on the classes i inserted in first forloop.
This is the result of the same timestamps e.g. first iteration:
len sz: 28000
len prei: 450000
len pre int: 29700000
logging
len post int: 1485499
How the * does preint and post int (interictal class) get this high of a count? it doesn't at all correspond somewhat to the number interictal in the first?
here my function.
def insert_class_col(dataframe, sz_info_list, date_converter, save_filename, save_path, file_sample_rate, file_channel):
print(f"sz_info_list: {sz_info_list}")
if "class" not in dataframe.columns:
dataframe.insert(0, "class", np.nan)
file_channel.extend(['timestamp', 'class'])
dataframe = dataframe[file_channel]
# Insert class attributes to ensure that seizure, preictal, interictal does not overlap.
for index, container in enumerate(sz_info_list):
delay = container.delay * 1000
duration = container.duration * 1000
sz_start = date_converter(container.time_emu) + delay
sz_end = sz_start + duration
print(f"sz_start index = {sz_start}")
print(f"sz_end: {sz_end}")
preictal_start = sz_start - (15 * 60 * 1000)
interictal_start = sz_start - (1 * 60 * 60 * 1000)
interictal_end = sz_end + (1 * 60 * 60 * 1000)
dataframe['timestamp'] = pd.to_numeric(dataframe['timestamp'])
# hvis data er sezure tag seizure
# hvis data er preictal tag preictal/interictal, men ikke indenfor seizure data.
dataframe.loc[(dataframe['timestamp'] >= sz_start) & (dataframe['timestamp'] < sz_end), "class"] = class_mapping['Seizure']
dataframe.loc[(dataframe['class'] != class_mapping['Seizure']) & (dataframe['timestamp'] >= preictal_start) & (dataframe['timestamp'] < sz_start), "class"] = class_mapping['Preictal']
dataframe.loc[(dataframe['class'] != class_mapping['Seizure']) & (dataframe['class'] != class_mapping['Preictal']) & (dataframe['timestamp'] >= interictal_start) & (dataframe['timestamp'] < interictal_end), "class"] = class_mapping['Interictal']
print(f"mapping: {class_mapping} \n value counts: \n{dataframe['class'].value_counts()}")
print(f"Begginging current number of class in df {dataframe['class'].value_counts()}")
# Saving to csv
for index, container in enumerate(sz_info_list):
delay = container.delay * 1000
duration = container.duration * 1000
sz_start = date_converter(container.time_emu) + delay
sz_end = sz_start + duration
print(f"sz_start index = {sz_start}")
print(f"sz_end: {sz_end}")
preictal_start = sz_start - (15 * 60 * 1000)
interictal_start = sz_start - (1 * 60 * 60 * 1000)
interictal_end = sz_end + (1 * 60 * 60 * 1000)
dataframe['timestamp'] = pd.to_numeric(dataframe['timestamp'])
#INSERTING SEIZURE CLASS
sz_df = dataframe[(dataframe['timestamp'] >= sz_start) & (dataframe['timestamp'] < sz_end)].copy()
print(f"len sz: {len(sz_df)}")
#df_save_compress(f"Seizure_{index}_{save_filename}", save_path + "/Seizure", sz_df)
#logging_info_txt(f"Seizure_{index}_{save_filename}", save_path, file_sample_rate, file_channel)
#INSERTING PREICTAL
prei_df = dataframe[(dataframe['timestamp'] >= preictal_start) & (dataframe['timestamp'] < sz_start) & (dataframe['class'] != class_mapping["Seizure"])].copy()
print(f"len prei: {len(prei_df)}")
#df_save_compress(f"Preictal_{index}_{save_filename}", save_path + "/Preictal", prei_df)
#logging_info_txt(f"Preictal_{index}_{save_filename}", save_path, file_sample_rate, file_channel)
#INSERTING INTERICTAL
pre_int_df = dataframe[(dataframe['timestamp'] >= interictal_start) & (dataframe['timestamp'] < preictal_start) & (dataframe['class'] != class_mapping["Seizure"]) | (dataframe['class'] != class_mapping["Preictal"])].copy()
print(f"len pre int: {len(pre_int_df)}")
#df_save_compress(f"PreInt_{index}_{save_filename}", save_path + "/Interictal", pre_int_df)
logging_info_txt(f"PreInt_{index}_{save_filename}", save_path, file_sample_rate, file_channel)
post_int_df = dataframe[(dataframe['timestamp'] >= sz_end) & (dataframe['timestamp'] < interictal_end) & (dataframe['class'] != class_mapping["Seizure"]) & (dataframe['class'] != class_mapping["Preictal"])].copy()
print(f"len post int: {len(post_int_df)}")
#df_save_compress(f"PostInt_{index}_{save_filename}", save_path + "/Interictal", post_int_df)
logging_info_txt(f"PostInt_{index}_{save_filename}", save_path, file_sample_rate, file_channel)
#print(f"after = len df: {len(dataframe)} values class: \n {dataframe['class'].value_counts()}")
# clean up
del pre_int_df, post_int_df, sz_df, prei_df
gc.collect()
Notice that preint which is interictal is 29700000 while printing the classes i should be lower than 3150000.
Any ideas of this pandas behavior?
#richardec answered the question see comments.

VPN Indicator ThinkScript to Python

Taking a stab at converting a ThinkScript to Python for the first time, and I think my logic is right, but I am missing something as the two plots for the indicator don't match.
Trying to convert the ThinkScript for the VPNIndicator to a Python implementation. Looking for someone knowledgeable in both languages to contribute here.
To start, the indicator plot in ThinkorSwim looks like this (bottom):
So I'm trying to replicate that plot using matplotlib finance, but first I need to translate from ThinkScript to Python, which I've attempted here:
import mplfinance as mpf
import pandas as pd
import numpy as np
import talib
def VPN_Indicator(df, params):
# def atr = WildersAverage(TrueRange(high, close, low), length);
df['H-L'] = df['High'] - df['Low']
df['H-C1'] = df['High'] - df['Close'].shift()
df['C1-L'] = df['Close'].shift() - df['Low']
df['TrueRange'] = df[['H-L','H-C1','C1-L']].max(axis=1)
df['WildersATR'] = df['TrueRange'].ewm(alpha=1.0 / params['length'], adjust=False).mean()
# def diff = hlc3 - hlc3[1];
df['Diff'] = ((df['High'] + df['Low'] + df['Close']) / 3) - ((df['High'].shift() + df['Low'].shift() + df['Close'].shift()) / 3) # Forward peak here?
# def vp = Sum(if diff > factor * atr then volume else 0, length);
df['VP_Helper'] = np.where(df['Diff'] > params['factor'] * df['WildersATR'], df['Volume'], 0)
df['VP'] = df['VP_Helper'].rolling(params['length']).sum()
# def vn = Sum(if diff < -factor * atr then volume else 0, length);
df['VN_Helper'] = np.where(df['Diff'] < -params['factor'] * df['WildersATR'], df['Volume'], 0)
df['VN'] = df['VN_Helper'].rolling(params['length']).sum()
# plot VPN = ExpAverage(100 * (vp - vn) / Sum(volume, length), emaLength);
df['RollingVol'] = df['Volume'].rolling(params['length']).sum()
df['VPN'] = talib.EMA(100 * (df['VP'] - df['VN']) / df['RollingVol'], timeperiod=params['emaLength'])
# plot VPNAvg = MovingAverage(averageType, VPN, averageLength);
if params['averageType'] in ['simple','sma','SMA','SIMPLE']:
df['VPNAvg'] = talib.SMA(df['VPN'], timeperiod=params['averageLength'])
# plot CriticalLevel = criticalValue;
df['CriticalLevel'] = params['criticalValue']
# VPN.DefineColor("Above", Color.UPTICK);
# VPN.DefineColor("Below", Color.DOWNTICK);
# VPN.AssignValueColor(if VPN > CriticalLevel then VPN.Color("Above") else VPN.Color("Below"));
# VPNAvg.SetDefaultColor(GetColor(7));
# CriticalLevel.SetDefaultColor(GetColor(1));
# Gimicks, don't need the top bit for now
return df
params = {
"length": 30,
"emaLength": 3,
"averageLength": 30,
"factor": 0.1,
"criticalValue": 10,
"averageType": "simple"
}
# Import a 1min dataset and rename columns as necessary
df = pd.read_csv("SPY.csv").iloc[-2000:,:]
df['time'] = pd.to_datetime(df['time'])
df = df.set_index('time')
df = df.rename(columns={'open':'Open', 'high':'High', 'low':"Low", "close": "Close", "volume": "Volume"})
df = VPN_Indicator(df, params)
# Plot the results
apds = [ mpf.make_addplot((df['CriticalLevel']), panel=2, color='g'),
mpf.make_addplot((df['VPN']), panel=2, color='g'),
mpf.make_addplot((df['VPNAvg']), panel=2, color='g'),
]
mpf.plot(df[['Open', 'High', 'Low', 'Close', 'Volume']], addplot=apds, figscale=1.2, volume=True)
... which results in a plot that looks like this:
... which is close, but the peaks don't line up with the ThinkOrSwim plot. So I'm wanting to know from someone who knows these languages where I might be off? Thanks!
Try using this to calculate ATR. This gives the same output as TOS.
import numpy as np
def ema(arr, periods=14, weight=1, init=None):
leading_na = np.where(~np.isnan(arr))[0][0]
arr = arr[leading_na:]
alpha = weight / (periods + (weight-1))
alpha_rev = 1 - alpha
n = arr.shape[0]
pows = alpha_rev**(np.arange(n+1))
out1 = np.array([])
if 0 in pows:
out1 = ema(arr[:int(len(arr)/2)], periods)
arr = arr[int(len(arr)/2) - 1:]
init = out1[-1]
n = arr.shape[0]
pows = alpha_rev**(np.arange(n+1))
scale_arr = 1/pows[:-1]
if init:
offset = init * pows[1:]
else:
offset = arr[0]*pows[1:]
pw0 = alpha*alpha_rev**(n-1)
mult = arr*pw0*scale_arr
cumsums = mult.cumsum()
out = offset + cumsums*scale_arr[::-1]
out = out[1:] if len(out1) > 0 else out
out = np.concatenate([out1, out])
out[:periods] = np.nan
out = np.concatenate(([np.nan]*leading_na, out))
return out
def atr(highs, lows, closes, periods=14, ema_weight=1):
hi = np.array(highs)
lo = np.array(lows)
c = np.array(closes)
tr = np.vstack([np.abs(hi[1:]-c[:-1]),
np.abs(lo[1:]-c[:-1]),
(hi-lo)[1:]]).max(axis=0)
atr = ema(tr, periods=periods, weight=ema_weight)
atr = np.concatenate([[np.nan], atr])
return atr

Dataframe and updating a new column value in a for loop

I am trying to update a value in a dataframe using a method and a forloop. I pass the dataframe into the method and use a for loop to calculate the value I want to put into the last column.
Here is the method
def vwap2(df):
sumTpv = 0.00
sumVolume = 0
dayVwap = 0.00
for i, row in df.iterrows():
#Get all values from each row
#Find typical price
tp = (row['HIGH'] + row['LOW'] + row['CLOSE'] + row['OPEN']) / 4
tpv = tp * row['VOLUME']
sumTpv= sumTpv + tpv
sumVolume = sumVolume + row['VOLUME']
vwap = sumTpv / sumVolume
#Find VWAP
#df.assign(VWAP = vwap)
#row.assign(VWAP = vwap)
#row["VWAP"] = vwap
df.set_value(row, 'VWAP', vwap)
df = df.reindex(row = row)
df[row] = df[row].astype(float)
dayVwap = dayVwap + vwap
print('Day VWAP = ', dayVwap)
print('TPV sum = ', sumTpv)
print('Day Volume = ', sumVolume)
return df
And the Dataframe already has the column in it as I add it to it before I pass the df into the method. Like this
df["VWAP"] = ""
#do vwap calculation
df = vwap2(df)
But the values either are all the same which should not be or are not written. I tried a few things but to no success.
Updates
Here is the data that I am using, I am pulling it from Google each time:
CLOSE HIGH LOW OPEN VOLUME TP \
2018-05-10 22:30:00 97.3600 97.48 97.3000 97.460 371766 97.86375
1525991460000000000 97.2900 97.38 97.1800 97.350 116164 97.86375
1525991520000000000 97.3100 97.38 97.2700 97.270 68937 97.86375
1525991580000000000 97.3799 97.40 97.3101 97.330 46729 97.86375
1525991640000000000 97.2200 97.39 97.2200 97.365 64823 97.86375
TPV SumTPV SumVol VWAP
2018-05-10 22:30:00 3.722224e+08 1.785290e+09 18291710 97.601027
1525991460000000000 3.722224e+08 1.785290e+09 18291710 97.601027
1525991520000000000 3.722224e+08 1.785290e+09 18291710 97.601027
1525991580000000000 3.722224e+08 1.785290e+09 18291710 97.601027
1525991640000000000 3.722224e+08 1.785290e+09 18291710 97.601027
As you can see all the calculated stuff is the same.
Here is what I am using right now.
def vwap2(df):
sumTpv = 0.00
sumVolume = 0
dayVwap = 0.00
for i, row in df.iterrows():
#Get all values from each row
#Find typical price
tp = (row['HIGH'] + row['LOW'] + row['CLOSE'] + row['OPEN']) / 4
df['TP'] = tp
tpv = tp * row['VOLUME']
df['TPV'] = tpv
sumTpv= sumTpv + tpv
df['SumTPV'] = sumTpv
sumVolume = sumVolume + row['VOLUME']
df['SumVol'] = sumVolume
vwap = sumTpv / sumVolume
#Find VWAP
#row.assign(VWAP = vwap)
#row["VWAP"] = vwap
#df.set_value(row, 'VWAP', vwap)
df["VWAP"] = vwap
dayVwap = dayVwap + vwap
print('Day VWAP = ', dayVwap)
print('TPV sum = ', sumTpv)
print('Day Volume = ', sumVolume)
return df
IIUC, you don't need a loop, or even apply - you can use direct column assignment and cumsum() to get what you're looking for.
Some example data:
import numpy as np
import pandas as pd
N = 20
high = np.random.random(N)
low = np.random.random(N)
close = np.random.random(N)
opening = np.random.random(N)
volume = np.random.random(N)
data = {"HIGH":high, "LOW":low, "CLOSE":close, "OPEN":opening, "VOLUME":volume}
df = pd.DataFrame(data)
df.head()
CLOSE HIGH LOW OPEN VOLUME
0 0.848676 0.260967 0.004188 0.139342 0.931406
1 0.771065 0.356639 0.495715 0.652106 0.988217
2 0.288206 0.567776 0.023687 0.809410 0.134134
3 0.832711 0.508586 0.031569 0.120774 0.891948
4 0.857051 0.391618 0.155635 0.069054 0.628036
Assign the tp and tpv columns directly, then apply cumsum to get sumTpv and sumVolume:
df["tp"] = (df['HIGH'] + df['LOW'] + df['CLOSE'] + df['OPEN']) / 4
df["tpv"] = df.tp * df['VOLUME']
df["sumTpv"] = df.tpv.cumsum()
df["sumVolume"] = df.VOLUME.cumsum()
df["vwap"] = df.sumTpv.div(df.sumVolume)
df.head()
CLOSE HIGH LOW OPEN VOLUME tp tpv \
0 0.848676 0.260967 0.004188 0.139342 0.931406 0.313293 0.291803
1 0.771065 0.356639 0.495715 0.652106 0.988217 0.568881 0.562178
2 0.288206 0.567776 0.023687 0.809410 0.134134 0.422270 0.056641
3 0.832711 0.508586 0.031569 0.120774 0.891948 0.373410 0.333063
4 0.857051 0.391618 0.155635 0.069054 0.628036 0.368340 0.231331
sumTpv sumVolume vwap
0 0.291803 0.931406 0.313293
1 0.853982 1.919624 0.444869
2 0.910622 2.053758 0.443393
3 1.243685 2.945706 0.422203
4 1.475016 3.573742 0.412737
Update (per OP comment):
To get dayVwap as the sum of all vwap, use dayVwap = df.vwap.sum().

Categories