I am calculating the standard deviation of the rolling mean (Bollinger Bands, example here is very simplified) in a pandas dataframe like this:
import pandas as pd
import numpy as np
no_of_std = 3
window = 20
df = pd.DataFrame({'A': [34, 34, 34, 33, 32, 34, 35.0, 21, 22, 25, 23, 21, 39, 26, 31, 34, 38, 26, 21, 39, 31]})
rolling_mean = df['A'].rolling(window).mean()
rolling_std = df['A'].rolling(window).std(ddof=0)
df['M'] = rolling_mean
df['BBL'] = rolling_mean - (rolling_std * no_of_std)
df['BBH'] = rolling_mean + (rolling_std * no_of_std)
print (df)
The result looks like this:
A M BBL BBH
0 34.0 NaN NaN NaN
1 34.0 NaN NaN NaN
2 34.0 NaN NaN NaN
3 33.0 NaN NaN NaN
4 32.0 NaN NaN NaN
5 34.0 NaN NaN NaN
6 35.0 NaN NaN NaN
7 21.0 NaN NaN NaN
8 22.0 NaN NaN NaN
9 25.0 NaN NaN NaN
10 23.0 NaN NaN NaN
11 21.0 NaN NaN NaN
12 39.0 NaN NaN NaN
13 26.0 NaN NaN NaN
14 31.0 NaN NaN NaN
15 34.0 NaN NaN NaN
16 38.0 NaN NaN NaN
17 26.0 NaN NaN NaN
18 21.0 NaN NaN NaN
19 39.0 30.10 11.633544 48.566456
20 31.0 29.95 11.665375 48.234625
Now I want to calculate in the other direction: what value would the last entry in column 'A' need to have in order to hit exactly the 3rd standard deviation of the rolling mean?
In other words: what value would A need to take in the next row so that it is exactly equal to the value in BBH or BBL?
I can do this by recursive approximation, but it costs a lot of performance and I think there must be a better way. Here is an example of that solution, which I think is too slow; there must be a better, faster way:
import pandas as pd

odf = pd.DataFrame({'A': [34, 34, 34, 33, 32, 34, 35.0, 21, 22, 25, 23, 21, 39, 26, 31, 34, 38, 26, 21, 39, 31]})

def get_last_bbh_bbl(idf):
    xdf = idf.copy()
    no_of_std = 3
    window = 20
    rolling_mean = xdf['A'].rolling(window).mean()
    rolling_std = xdf['A'].rolling(window).std()
    xdf['M'] = rolling_mean
    xdf['BBL'] = rolling_mean - (rolling_std * no_of_std)
    xdf['BBH'] = rolling_mean + (rolling_std * no_of_std)
    bbh = xdf.loc[len(xdf) - 1, 'BBH']
    bbl = xdf.loc[len(xdf) - 1, 'BBL']
    return bbh, bbl

def search_matching_value(idf, low, high, search_for):
    xdf = idf.copy()
    if abs(high - low) < 0.000001:
        return high
    middle = low + ((high - low) / 2)
    # append the candidate value as a new row
    xdf = pd.concat([xdf, pd.DataFrame({'A': [middle]})], ignore_index=True)
    bbh, bbl = get_last_bbh_bbl(xdf)
    if search_for == 'bbh':
        if bbh < middle:
            result = search_matching_value(idf, low, middle, search_for)
        elif bbh > middle:
            result = search_matching_value(idf, middle, high, search_for)
        else:
            return middle
    elif search_for == 'bbl':
        if bbl > middle:
            result = search_matching_value(idf, middle, high, search_for)
        elif bbl < middle:
            result = search_matching_value(idf, low, middle, search_for)
        else:
            return middle
    return result

actual_bbh, actual_bbl = get_last_bbh_bbl(odf)
last_value = odf.loc[len(odf) - 1, 'A']
print('last_value: {}, actual bbh: {}, actual bbl: {}'.format(last_value, actual_bbh, actual_bbl))
low = last_value
high = actual_bbh * 10
next_value_that_hits_bbh = search_matching_value(odf, low, high, 'bbh')
print('next_value_that_hits_bbh: {}'.format(next_value_that_hits_bbh))
low = 0
high = last_value
next_value_that_hits_bbl = search_matching_value(odf, low, high, 'bbl')
print('next_value_that_hits_bbl: {}'.format(next_value_that_hits_bbl))
The result looks like this:
last_value: 31.0, actual bbh: 48.709629106422284, actual bbl: 11.190370893577711
next_value_that_hits_bbh: 57.298733206475276
next_value_that_hits_bbl: 2.174952656030655
Here is a solution that calculates the next value with a faster algorithm: Newton's method. Both the optimized scipy version and a classic implementation are faster than the dichotomy (bisection) approach, and this solution does not rebuild a DataFrame for every candidate value; it calls the functions from the statistics library directly.
For reference, see the documentation of scipy.optimize.newton.
from scipy import misc  # note: scipy.misc.derivative is deprecated in newer SciPy releases
import pandas as pd
import statistics
from scipy.optimize import newton  # the optimized Newton function, if you want to test it

def get_last_bbh_bbl(idf):
    xdf = idf.copy()
    rolling_mean = xdf['A'].rolling(window).mean()
    rolling_std = xdf['A'].rolling(window).std()
    xdf['M'] = rolling_mean
    xdf['BBL'] = rolling_mean - (rolling_std * no_of_std)
    xdf['BBH'] = rolling_mean + (rolling_std * no_of_std)
    bbh = xdf.loc[len(xdf) - 1, 'BBH']
    bbl = xdf.loc[len(xdf) - 1, 'BBL']
    lastvalue = xdf.loc[len(xdf) - 1, 'A']
    return lastvalue, bbh, bbl

# classic Newton's method
def NewtonsMethod(f, x, tolerance=0.00000001):
    while True:
        x1 = x - f(x) / misc.derivative(f, x)
        t = abs(x1 - x)
        if t < tolerance:
            break
        x = x1
    return x

# calculates bbl(x) - x (we want 0!)
def low(x):
    l = lastlistofvalue[:-1]
    l.append(x)
    avg = statistics.mean(l)
    std = statistics.stdev(l, avg)
    return avg - std * no_of_std - x

# calculates bbh(x) - x (we want 0!)
def high(x):
    l = lastlistofvalue[:-1]
    l.append(x)
    avg = statistics.mean(l)
    std = statistics.stdev(l, avg)
    return avg + std * no_of_std - x

odf = pd.DataFrame({'A': [34, 34, 34, 33, 32, 34, 35.0, 21, 22, 25, 23, 21, 39, 26, 31, 34, 38, 26, 21, 39, 31]})
no_of_std = 3
window = 20
lastlistofvalue = odf['A'].shift(0).to_list()[::-1][:window]

""" Newton classic method """
x = odf.loc[len(odf) - 1, 'A']
x0 = NewtonsMethod(high, x)
print(f'value to hit bbh: {x0}')
odf = pd.DataFrame({'A': [34, 34, 34, 33, 32, 34, 35.0, 21, 22, 25, 23, 21, 39, 26, 31, 34, 38, 26, 21, 39, 31, x0]})
lastvalue, new_bbh, new_bbl = get_last_bbh_bbl(odf)
print(f'value to hit bbh: {lastvalue} -> check new bbh: {new_bbh}')
x0 = NewtonsMethod(low, x)
print(f'value to hit bbl: {x0}')
odf = pd.DataFrame({'A': [34, 34, 34, 33, 32, 34, 35.0, 21, 22, 25, 23, 21, 39, 26, 31, 34, 38, 26, 21, 39, 31, x0]})
lastvalue, new_bbh, new_bbl = get_last_bbh_bbl(odf)
print(f'value to hit bbl: {lastvalue} -> check new bbl: {new_bbl}')
output:
value to hit bbh: 57.298732375228624
value to hit bbh: 57.298732375228624 -> check new bbh: 57.29873237527272
value to hit bbl: 2.1749518354059636
value to hit bbl: 2.1749518354059636 -> check new bbl: 2.1749518353102992
You can compare with the optimized Newton method like this:
""" Newton optimized method """
x = odf.loc[len(odf) - 1, 'A']
x0 = newton(high, x, fprime=None, args=(), tol=1.00e-08, maxiter=50, fprime2=None)
print(f'Newton opt value to hit bbh: {x0}')
x0 = newton(low, x, fprime=None, args=(), tol=1.48e-08, maxiter=50, fprime2=None)
print(f'Newton value to hit bbl: {x0}')
output:
Newton opt value to hit bbh: 57.29873237532118
Newton value to hit bbl: 2.1749518352051225
With the optimized Newton you can also play with the maximum number of iterations, and the optimized version is faster than the classic one. Timings per calculation:
0.002 sec for optimized
0.005 sec for classic
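If you want to reproduce such measurements, a minimal timing sketch (the numbers depend on the machine; this is not the original benchmark code):
import time

start = time.perf_counter()
newton(high, x, tol=1.0e-08, maxiter=50)
print(f'optimized: {time.perf_counter() - start:.4f} sec')

start = time.perf_counter()
NewtonsMethod(high, x)
print(f'classic:   {time.perf_counter() - start:.4f} sec')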
Remarks:
If you use rolling(window).std() you are using the sample standard deviation, so you have to use std = statistics.stdev(l, avg), which divides by N - 1 items.
If you use rolling(window).std(ddof=0) you are using the population standard deviation, so you have to use std = statistics.pstdev(l, avg), which divides by N items.
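A quick sketch to verify that correspondence (the sample values here are my own, not from the answer):
import statistics
import pandas as pd

values = [34.0, 33.0, 32.0, 35.0, 21.0]
s = pd.Series(values)
print(s.std(), statistics.stdev(values))          # sample std: both divide by N - 1
print(s.std(ddof=0), statistics.pstdev(values))   # population std: both divide by N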
Related
I am testing my functions that calculate price indicators and I have a strange bug that I don't know how to resolve.
EDIT: The columns in the CSV I've shared are all lower case; if you want to test the function with this CSV, use this code:
data = pd.read_csv(csv_path)
data = data.drop(['symbol'], axis=1)
data.rename(columns={'open': 'Open', 'high': 'High', 'low': 'Low', 'close': 'Close', 'volume': 'Volume'}, inplace=True)
Link to data .csv file
You can try it using the function with default arguments. (At the bottom of the post I am also sharing an auxiliary input_type function; just make sure not to use an input mode higher than 4, since the HL2, HLC3, OHLC4 and HLCC4 input modes are not calculated for this CSV.)
So I am calculating Weighted Moving Average using this function:
(I am testing this function with default arguments)
def wma(price_df: PandasDataFrame, n: int = 14, input_mode: int = 2, from_price: bool = True, *,
        indicator_name: str = 'None') -> PandasDataFrame:
    if from_price:
        name_var, state = input_type(__input_mode__=input_mode)
    else:
        if indicator_name == 'None':
            raise TypeError('Invalid input argument. indicator_name cannot be set to None if from_price is False.')
        else:
            name_var = indicator_name
    wma_n = pd.DataFrame(index=range(price_df.shape[0]), columns=range(1))
    wma_n.rename(columns={0: f'WMA{n}'}, inplace=True)
    weight = np.arange(1, (n + 1)).astype('float64')
    weight = weight * n
    norm = sum(weight)
    weight_df = pd.DataFrame(weight)
    weight_df.rename(columns={0: 'weight'}, inplace=True)
    product = pd.DataFrame()
    product_sum = 0
    for i in range(price_df.shape[0]):
        if i < (n - 1):
            # creating NaN values where it is impossible to calculate the WMA, to drop them later
            wma_n[f'WMA{n}'].iloc[i] = np.nan
        elif i == (n - 1):
            product = price_df[f'{name_var}'].iloc[:(i + 1)] * weight_df['weight']
            product_sum = product.sum()
            wma_n[f'WMA{n}'].iloc[i] = product_sum / norm
            print(f'index: {i}, wma: ', wma_n[f'WMA{n}'].iloc[i])
            print(product_sum)
            print(norm)
            product = product.iloc[0:0]
            product_sum = 0
        elif i > (n - 1):
            product = price_df[f'{name_var}'].iloc[(i - (n - 1)): (i + 1)] * weight_df['weight']
            product_sum = product.sum()
            wma_n[f'WMA{n}'].iloc[i] = product_sum / norm
            print(f'index: {i}, wma: ', wma_n[f'WMA{n}'].iloc[i])
            print(product_sum)
            print(norm)
            product = product.iloc[0:0]
            product_sum = 0
    return wma_n
For some reason the value drops to 0.0 after iteration 26, and I have no earthly idea why.
Can someone please help me?
My output:
index: 13, wma: 14467.42857142857
product_sum: 21267120.0
norm 1470.0
index: 14, wma: 14329.609523809524
product_sum: 21064526.0
norm 1470.0
index: 15, wma: 14053.980952380953
product_sum: 20659352.0
norm 1470.0
index: 16, wma: 13640.480952380953
product_sum: 20051507.0
norm 1470.0
index: 17, wma: 13089.029523809522
product_sum: 19240873.4
norm 1470.0
index: 18, wma: 12399.72
product_sum: 18227588.4
norm 1470.0
index: 19, wma: 11572.234285714285
product_sum: 17011184.4
norm 1470.0
index: 20, wma: 10607.100952380953
product_sum: 15592438.4
norm 1470.0
index: 21, wma: 9504.32
product_sum: 13971350.4
norm 1470.0
index: 22, wma: 8263.905714285715
product_sum: 12147941.4
norm 1470.0
index: 23, wma: 6885.667619047619
product_sum: 10121931.4
norm 1470.0
index: 24, wma: 5369.710476190477
product_sum: 7893474.4
norm 1470.0
index: 25, wma: 3716.270476190476
product_sum: 5462917.6
norm 1470.0
index: 26, wma: 1926.48
product_sum: 2831925.6
norm 1470.0
index: 27, wma: 0.0
product_sum: 0.0
norm 1470.0
index: 28, wma: 0.0
product_sum: 0.0
norm 1470.0
Auxiliary function needed to run my function:
def input_type(__input_mode__: int) -> (str, bool):
    list_of_inputs = ['Open', 'Close', 'High', 'Low', 'HL2', 'HLC3', 'OHLC4', 'HLCC4']
    if __input_mode__ in range(1, 10, 1):
        input_name = list_of_inputs[__input_mode__ - 1]
        state = True
        return input_name, state
    else:
        raise TypeError('__input_mode__ out of range.')
This problem is caused by a Pandas feature called alignment. Imagine you have two dataframes. One dataframe shows how much you own of each stock. The other DataFrame shows the stock price of each one. However, they're not in the same order, and there's missing data.
df_shares_held = pd.DataFrame({'shares': [1, 5, 10]}, index=['ABC', 'DEF', 'XYZ'])
df_price_per_share = pd.DataFrame({'price': [0.54, 1.1]}, index=['XYZ', 'ABC'])
These dataframes look like this:
shares
ABC 1
DEF 5
XYZ 10
price
XYZ 0.54
ABC 1.10
Pandas will let you multiply these two columns together.
print(df_shares_held['shares'] * df_price_per_share['price'])
ABC 1.1
DEF NaN
XYZ 5.4
dtype: float64
Notice it matched up the price for ABC with the number of shares for ABC, despite them being in different orders in the original dataframes. DEF, which is missing a share price, now becomes NaN, because one side of the multiplication is missing a value.
Pandas is doing something similar here. This is the value of price_df[f'{name_var}'].iloc[(i - (n - 1)): (i + 1)] partway through the loop:
1 14470.5
2 14472.5
3 14475.6
4 14475.5
5 14481.0
6 14477.0
7 14474.0
8 14471.5
9 14471.5
10 14470.5
11 14467.6
12 14456.0
13 14448.6
14 14446.6
Name: Close, dtype: float64
Notice this starts at 1 and ends at 14.
This is the value of weight_df['weight'] in the same loop:
0 14.0
1 28.0
2 42.0
3 56.0
4 70.0
5 84.0
6 98.0
7 112.0
8 126.0
9 140.0
10 154.0
11 168.0
12 182.0
13 196.0
Name: weight, dtype: float64
Notice this starts at 0 and ends at 13.
And this is the product of the two:
0 NaN
1 405174.0
2 607845.0
3 810633.6
4 1013285.0
5 1216404.0
6 1418746.0
7 1621088.0
8 1823409.0
9 2026010.0
10 2228457.0
11 2430556.8
12 2630992.0
13 2831925.6
14 NaN
dtype: float64
You now have NaNs for the first and last value, and only 13 real values. Each time it loops, it will lose one more value.
But why does it return zero, and not NaN? Pandas ignores NaN values when doing a sum over a column. If you sum only NaN values, then it returns zero.
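A quick demonstration of that behaviour (a sketch):
import numpy as np
import pandas as pd

print(pd.Series([1.0, np.nan]).sum())                 # 1.0 - the NaN is skipped
print(pd.Series([np.nan, np.nan]).sum())              # 0.0 - an all-NaN sum defaults to zero
print(pd.Series([np.nan, np.nan]).sum(min_count=1))   # nan - min_count opts out of that default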
So how can you avoid alignment? There are many ways.
Approach #1: You could call reset_index():
product = price_df[f'{name_var}'].iloc[(i - (n - 1)): (i + 1)].reset_index(drop=True) * weight_df['weight']
This puts the index back to starting at zero.
Approach #2: You could use numpy to do the calculation. Numpy doesn't care about alignment.
product = price_df[f'{name_var}'].iloc[(i - (n - 1)): (i + 1)].values * weight_df['weight'].values
Approach #3: Pandas already has a way to calculate what you're looking for - they're called rolling window calculations.
import numpy as np

def wma2(price_df, n: int = 14, input_mode: int = 2, from_price: bool = True, *,
         indicator_name: str = 'None'):
    if from_price:
        name_var, state = input_type(__input_mode__=input_mode)
    else:
        if indicator_name == 'None':
            raise TypeError('Invalid input argument. indicator_name cannot be set to None if from_price is False.')
        else:
            name_var = indicator_name
    weights = np.arange(1, (n + 1)).astype('float64')
    weights_normalized = weights / weights.sum()
    wma_series = price_df[name_var].rolling(n).apply(
        lambda window: np.dot(window, weights_normalized)
    )
    return pd.DataFrame({f'WMA{n}': wma_series})
This is not only simpler, but faster too.
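A minimal usage sketch, using the renamed DataFrame data from the EDIT in the question (this assumes input_type is defined and the default input mode maps to 'Close'):
wma14 = wma2(data, n=14)
print(wma14.head(20))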
I think the reason this is happening is that your weight_df has indices 0-13, but when you iterate over your price_df, the indices will be 0-13 at first, then 1-14, then 2-15, 3-16, 4-17, etc. This means that when you multiply those together:
product = price_df[f'{name_var}'].iloc[(i - (n - 1)): (i + 1)] * weight_df['weight']
You will get a whole bunch of NaN values because the indices do not align! Here is an illustration of the effect, which gets progressively worse:
import pandas as pd
a = pd.Series([4, 5, 6], index=[1, 2, 3])
b = pd.Series([1, 2, 3], index=[3, 4, 5])
out = a * b
out:
1 NaN
2 NaN
3 6.0
4 NaN
5 NaN
In your case, the index of both weight_df and price_df are drifting apart more and more as you iterate, creating more and more NaNs.
I'm sure this can be solved, but I would highly recommend doing this in a more "pandas" manner. Have a look at this SO post: https://stackoverflow.com/a/53833851/9499196
Pandas DataFrames provide the .rolling method, which generates the windows you're trying to create manually for you. You can then apply a function (your weighted average) to each window by calling .apply on the Rolling object returned by price_df[your_col].rolling().
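For example, a minimal sketch of a weighted moving average built that way (the column name 'Close' and the window length are assumptions, not taken from your code):
import numpy as np

n = 14
weights = np.arange(1, n + 1, dtype=float)
weights /= weights.sum()
# each window is passed to the lambda as a numpy array because of raw=True
wma = price_df['Close'].rolling(n).apply(lambda w: np.dot(w, weights), raw=True)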
data = {
'aapl': [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
'aal' : [33, 33, 33, 32, 31, 30, 34, 29, 27, 26],
}
data = pd.DataFrame(data)
data.index = pd.date_range('2011-01-01', '2011-01-10')
n_obs = len(data) * 0.3
train, test = data[:n_obs], data[n_obs:]
>>> TypeError: cannot do slice indexing on DatetimeIndex with these indexers [3.0] of type float
I can probably slice the dataframe by date like df[:'2011-01-05'], but I want to split the data by number of observations, which I'm having difficulty doing with the method above.
You need to make sure you slice with an integer:
n_obs = int(len(data) * 0.3)
train, test = data[:n_obs], data[n_obs:]
output:
# train
aapl aal
2011-01-01 11 33
2011-01-02 12 33
2011-01-03 13 33
# test
aapl aal
2011-01-04 14 32
2011-01-05 15 31
2011-01-06 16 30
2011-01-07 17 34
2011-01-08 18 29
2011-01-09 19 27
2011-01-10 20 26
If you want to train/test a model you might be interested in getting a random sample:
test = data.sample(frac=0.3)
train = data.loc[data.index.difference(test.index)]
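If you need the random split to be reproducible, you can pass a seed (a small addition, not part of the original suggestion):
test = data.sample(frac=0.3, random_state=42)
train = data.loc[data.index.difference(test.index)]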
Suppose I have the following DataFrame:
df = pd.DataFrame({'id': [2, 4, 10, 12, 13, 14, 19, 20, 21, 22, 24, 25, 27, 29, 30, 31, 42, 50, 54],
'value': [37410.0, 18400.0, 200000.0, 392000.0, 108000.0, 423000.0, 80000.0, 307950.0,
50807.0, 201740.0, 182700.0, 131300.0, 282005.0, 428800.0, 56000.0, 412400.0, 1091595.0, 1237200.0,
927500.0]})
And I do the following:
df.sort_values(by='id').set_index('id').cumsum()
value
id
2 37410.0
4 55810.0
10 255810.0
12 647810.0
13 755810.0
14 1178810.0
19 1258810.0
20 1566760.0
21 1617567.0
22 1819307.0
24 2002007.0
25 2133307.0
27 2415312.0
29 2844112.0
30 2900112.0
31 3312512.0
42 4404107.0
50 5641307.0
54 6568807.0
I want to know the first id at which the cumulative sum exceeds 25% of the total sum. In this example, 25% of the total is 1,642,201.75, and the first id to exceed that is 22. I know it can be done with a for loop, but I think that would be pretty inefficient.
You could do:
percentile_25 = df['value'].sum() * 0.25
res = df[df['value'].cumsum() > percentile_25].head(1)
print(res)
Output
id value
9 22 201740.0
Or use searchsorted to do the search in O(log N):
percentile_25 = df['value'].sum() * 0.25
i = df['value'].cumsum().searchsorted(percentile_25)
res = df.iloc[i]
print(res)
Output
id 22.0
value 201740.0
Name: 9, dtype: float64
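Note that searchsorted only works here because the cumulative sum is non-decreasing (all values are non-negative). If you just want the id itself, a small sketch:
i = df['value'].cumsum().searchsorted(df['value'].sum() * 0.25)
print(df['id'].iloc[i])  # 22 for this data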
I am trying to write code that runs the following block for each of four different columns in a dataframe, producing four different arrays:
median_alcohol = df.alcohol.median()
for i, alcohol in enumerate(df.alcohol):
    if alcohol >= median_alcohol:
        df.loc[i, 'alcohol'] = 'high'
    else:
        df.loc[i, 'alcohol'] = 'low'
df.groupby('alcohol').quality.mean()
The columns in the dataframe are:
alcohol
pH
residual_sugar
citric_acid
I am trying to come up with a method to capture the four different arrays. Any ideas how I should go about this?
I'm not sure what you're actually trying to do, but from what I understood, you could try something like this:
import pandas as pd
from statistics import mean
df = pd.DataFrame({'alcohol':[45, 88, 56, 15, 71], 'pH':[12, 83, 56, 25,71],'residual_sugar':[14, 25, 55, 8, 21]})
print(df)
#Output
>>> alcohol pH residual_sugar
0 45 12 14
1 88 83 25
2 56 56 55
3 15 25 8
4 71 71 21
def func(colum):
    dftemp = df.copy()
    median_colum = eval('df.' + colum).median()
    for i, item in enumerate(eval('df.' + colum)):
        dftemp.loc[i, colum] = 'high' if item >= median_colum else 'low'
    return dftemp.groupby(colum).agg(list).applymap(mean)

different_arrays = [func(i) for i in df.columns]
for array in different_arrays:
    print(array)
Output:
pH residual_sugar
alcohol
high 70.0 33.666667
low 18.5 11.000000
alcohol residual_sugar
pH
high 71.666667 33.666667
low 30.000000 11.000000
alcohol pH
residual_sugar
high 71.666667 70.0
low 30.000000 18.5
def numeric_to_buckets(df, column_name):
    median = df[column_name].median()
    for i, val in enumerate(df[column_name]):
        if val >= median:
            df.loc[i, column_name] = 'high'
        else:
            df.loc[i, column_name] = 'low'

for feature in df.columns[:-1]:
    numeric_to_buckets(df, feature)
    print(df.groupby(feature).quality.mean(), '\n')
I have a dataframe with 4 sections
Section 1: Product details
Section 2: 6 Potential product values based on a range of simulations
Section 3: Upper and lower bound for the input parameter to the simulations
Section 4: Randomly generated values for the input parameters
Section 2 is generated by pricing the product at equal intervals between the upper and lower bound.
I need to take the values in Section 4 and figure out the corresponding product value. Here is a possible setup for this dataframe:
table2 = pd.DataFrame({
'Product Type': ['A', 'B', 'C', 'D'],
'State_1_Value': [10, 11, 12, 13],
'State_2_Value': [20, 21, 22, 23],
'State_3_Value': [30, 31, 32, 33],
'State_4_Value': [40, 41, 42, 43],
'State_5_Value': [50, 51, 52, 53],
'State_6_Value': [60, 61, 62, 63],
'Lower_Bound': [-1, 1, .5, 5],
'Upper_Bound': [1, 2, .625, 15],
'sim_1': [0, 0, .61, 7],
'sim_2': [1, 1.5, .7, 9],
})
>>> table2
Lower_Bound Product Type State_1_Value State_2_Value State_3_Value \
0 -1.0 A 10 20 30
1 1.0 B 11 21 31
2 0.5 C 12 22 32
3 5.0 D 13 23 33
State_4_Value State_5_Value State_6_Value Upper_Bound sim_1 sim_2
0 40 50 60 1.000 0.0 1.0
1 41 51 61 2.000 0.0 1.5
2 42 52 62 0.625 0.61 0.7
3 43 53 63 15.000 7.0 9.0
I will run through a couple examples of this calculation to make it clear what my question is.
Product A - sim_2
The input here is 1.0, which equals the upper bound for this product. Therefore the simulation value is equal to the state_6 value, 60.
Product B - sim_2
The input here is 1.5. The LB to UB range is (1, 2), so the 6 states are {1, 1.2, 1.4, 1.6, 1.8, 2}. 1.5 falls exactly in the middle of state_3 (value 31) and state_4 (value 41), so the simulation value is 36.
Product C - sim_1
The input here is .61. The LB to UB range is (.5, .625), so the 6 states are {.5, .525, .55, .575, .6, .625}. .61 lies between state 5 and state 6. Specifically, the bucket it falls in is 5*(.61-.5)/(.625-.5)+1 = 5.4 (multiplied by 5 because that is the number of intervals; you can calculate it other ways and get the same result). To calculate the value we then use that bucket to weight the values of state 5 and state 6: (62-52)*(5.4-5)+52 = 56.
Product B - sim_1
The input here is 0, which is below the lower bound of 1, so we need to extrapolate the value. We use the same formula as above, just with the values of state 1 and state 2. The bucket would be 5*(0-1)/(2-1)+1 = -4. The two values used are 11 and 21, so the value is (21-11)*(-4-1)+11 = -39.
I've also simplified the problem to try to visualize the solution, my final code needs to run on 500 values and 10,000 simulations, and the dataframe will have about 200 rows.
Here are the formulas I've used for the interpolation although I'm not committed to them specifically.
Bucket = N * (sim_value - LB) / (UB - LB) + 1
where N is the number of intervals.
Then nLower is the state value directly below the bucket and nHigher is the state value directly above it. If the bucket falls outside the LB/UB range, nLower and nHigher are forced to be either the first two or the last two state values.
Final_value = (nHigher - nLower) * (Bucket - index_of_nLower) + nLower
To summarize, my question is how I can generate the final results based on the combination of input data provided. The most challenging part to me is how to make the connection from the Bucket number to the nLower and nHigher values.
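To make the formulas concrete, here is a per-row sketch of the interpolation described above (function and variable names are mine, not from the question):
import numpy as np

def interpolate_row(states, lb, ub, sim_value, n_intervals=5):
    # states: the six state values for one product, ordered State_1 .. State_6
    bucket = n_intervals * (sim_value - lb) / (ub - lb) + 1
    lv = int(np.clip(np.floor(bucket), 1, n_intervals))  # index of nLower (1-based)
    hv = lv + 1                                          # index of nHigher
    n_lower, n_higher = states[lv - 1], states[hv - 1]
    return (n_higher - n_lower) * (bucket - lv) + n_lower

print(interpolate_row([12, 22, 32, 42, 52, 62], 0.5, 0.625, 0.61))  # about 56 (cf. Product C, sim_1)
print(interpolate_row([11, 21, 31, 41, 51, 61], 1, 2, 0))           # -39.0 (cf. Product B, sim_1)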
I was able to generate the result using the following code. I'm not sure of the memory implications on a large dataframe, so still interested in better answers or improvements.
Edit: Ran this code on the full dataset, 141 rows, 500 intervals, 10,000 simulations, and it took slightly over 1.5 hours. So not quite as useless as I assumed, but there is probably a smarter way of doing this in a tiny fraction of that time.
for i in range(1, 3):
    table2['Bucket%s' % i] = 5 * (table2['sim_%s' % i] - table2['Lower_Bound']) / (table2['Upper_Bound'] - table2['Lower_Bound']) + 1
    table2['lv'] = table2['Bucket%s' % i].map(int)
    table2['hv'] = table2['Bucket%s' % i].map(int) + 1
    table2.loc[table2['lv'] < 1, 'lv'] = 1
    table2.loc[table2['lv'] > 5, 'lv'] = 5
    table2.loc[table2['hv'] > 6, 'hv'] = 6
    table2.loc[table2['hv'] < 2, 'hv'] = 2
    table2['nLower'] = table2.apply(lambda row: row['State_%s_Value' % row['lv']], axis=1)
    table2['nHigher'] = table2.apply(lambda row: row['State_%s_Value' % row['hv']], axis=1)
    table2['Final_value_%s' % i] = (table2['nHigher'] - table2['nLower']) * (table2['Bucket%s' % i] - table2['lv']) + table2['nLower']
Output:
>>> table2
Lower_Bound Product Type State_1_Value State_2_Value State_3_Value \
0 -1.0 A 10 20 30
1 1.0 B 11 21 31
2 0.5 C 12 22 32
3 5.0 D 13 23 33
State_4_Value State_5_Value State_6_Value Upper_Bound sim_1 sim_2 \
0 40 50 60 1.000 0.00 1.0
1 41 51 61 2.000 0.00 1.5
2 42 52 62 0.625 0.61 0.7
3 43 53 63 15.000 7.00 9.0
Bucket1 lv hv nLower nHigher Final_value_1 Bucket2 Final_value_2
0 3.5 5 6 50 60 35.0 6.0 60.0
1 -4.0 3 4 31 41 -39.0 3.5 36.0
2 5.4 5 6 52 62 56.0 9.0 92.0
3 2.0 3 4 33 43 23.0 3.0 33.0
I posted a superior solution with no loops here:
Alternate method to avoid loop in pandas dataframe
import numpy as np
import pandas as pd

df = pd.DataFrame({
'Product Type': ['A', 'B', 'C', 'D'],
'State_1_Value': [10, 11, 12, 13],
'State_2_Value': [20, 21, 22, 23],
'State_3_Value': [30, 31, 32, 33],
'State_4_Value': [40, 41, 42, 43],
'State_5_Value': [50, 51, 52, 53],
'State_6_Value': [60, 61, 62, 63],
'Lower_Bound': [-1, 1, .5, 5],
'Upper_Bound': [1, 2, .625, 15],
'sim_1': [0, 0, .61, 7],
'sim_2': [1, 1.5, .7, 9],
})
buckets = df.iloc[:, -2:].sub(df['Lower_Bound'], axis=0).div(df['Upper_Bound'].sub(df['Lower_Bound'], axis=0), axis=0) * 5 + 1
low = buckets.applymap(int)
high = buckets.applymap(int) + 1
low = low.applymap(lambda x: 1 if x < 1 else x)
low = low.applymap(lambda x: 5 if x > 5 else x)
high = high.applymap(lambda x: 6 if x > 6 else x)
high = high.applymap(lambda x: 2 if x < 2 else x)
low_value = pd.DataFrame(df.filter(regex="State|Type").values[np.arange(low.shape[0])[:, None], low])
high_value = pd.DataFrame(df.filter(regex="State|Type").values[np.arange(high.shape[0])[:, None], high])
df1 = (high_value - low_value).mul((buckets - low).values) + low_value
df1['Product Type'] = df['Product Type']