I'm trying to run a calculation that uses multiple rows for every row in a dataframe.
My current solution takes more than 2 hours for 2,971,000 rows, so I want to find other ways to speed the function up.
My data looks like this, for example:
sig1 sig2 sig3 sig4 sig_p sig_t
20210114 05:52:02.00 0.0 0.0 0.0 0.0 11.5 -3.5
20210114 05:52:02.01 0.0 0.0 0.0 0.0 11.6 -3.5
20210114 05:52:02.02 0.0 0.0 0.0 0.0 11.5 -3.5
20210114 05:52:02.03 0.0 0.0 0.0 0.0 11.6 -3.5
20210114 05:52:02.04 0.0 0.0 0.0 0.0 11.7 -3.5
... ... ... ... ... ... ...
20210114 22:38:59.85 0.0 0.0 0.0 0.0 0.0 -0.5
20210114 22:38:59.86 0.0 0.0 0.0 0.0 0.0 -0.5
20210114 22:38:59.87 0.0 0.0 0.0 0.0 0.0 -0.5
20210114 22:38:59.88 0.0 0.0 0.0 0.0 0.0 -0.5
20210114 22:38:59.89 0.0 0.0 0.0 0.0 0.0 -0.5
I have a function that loops through the rows and calculates the value of newcol from sig1, sig_p, sig_t and the previous newcol value. The same function is then repeated for sig2, sig3 and sig4.
I'll show you the code I currently have, but it's too slow.
parameter.py

import math
from typing import NamedTuple

class Param(NamedTuple):
    RATIO: float
    D: float
    T: float
    M: float
    S: float
    W: float
    DYNAMIC: float
    T_CONST: float
    P_CONST: float
    L_COEF: float
    O_COEF: float

    @property
    def A(self):
        return (self.D**2) * math.pi

    @property
    def FACTOR(self):
        return self.S / self.A

Param1 = Param(
    RATIO = 0.74,
    D = 172e-3,
    T = 23e-3,
    M = 6,
    S = 53.7e-4,  # 4232.5e-6,
    W = 0.805,
    DYNAMIC = 0.3150,
    T_CONST = 2,  # 4,
    P_CONST = 0.2,  # 3,
    L_COEF = 0.8,  # 4,
    O_COEF = 2.5
)

Param2 = Param(
    RATIO = 0.26,
    D = 204e-3,
    T = 10e-3,
    M = 4,
    S = 26.8e-4,
    W = 0.38,
    DYNAMIC = 0.3150,
    T_CONST = 1.8,
    P_CONST = 0.2,
    L_COEF = 0.2,
    O_COEF = 1.8
)
test.py

import pandas as pd
import numpy as np
from scipy.interpolate import interp1d

from parameter import Param1, Param2

TIME_STAMP = 0.1
SPEC = 449
SPECIFIC = 935
EMISSIVITY = 0.7
ABSORBTIVITY = 0.3
DYNAMIC_SPEED = 12
COEFFICIENT = 0.9506173967164384
input_KV = [-75, -50, -25, -15, -10, -5, 0, 5, 10, 15, 20, 25, 30, 40, 50, 60,
80, 100, 125, 150, 175, 200, 225, 300, 412, 500, 600, 700, 800, 900, 1000, 1100]
viscosity_value = [7.4, 9.22, 11.18, 12.01, 12.43, 12.85, 13.28, 13.72, 14.16, 14.61, 15.06, 15.52, 15.98, 16.92, 17.88, 18.86, 20.88,
22.97, 25.69, 28.51, 31.44, 34.47, 37.6, 47.54, 63.82, 77.72, 94.62, 112.6, 131.7, 151.7, 172.7, 194.6]
input_ka = [-190, -150, -100, -75, -50, -25, -15, -10, -5, 0, 5, 10, 15, 20, 25, 30, 40,
50, 60, 80, 100, 125, 150, 175, 200, 225, 300, 412, 500, 600, 700, 800, 900, 1000, 1100]
conductivity_value = [7.82, 11.69, 16.2, 18.34, 20.41, 22.41, 23.2, 23.59, 23.97, 24.36, 24.74, 25.12, 25.5, 25.87, 26.24, 26.62,
27.35, 28.08, 28.8, 30.23, 31.62, 33.33, 35, 36.64, 38.25, 39.83, 44.41, 50.92, 55.79, 61.14, 66.32, 71.35, 76.26, 81.08, 85.83]
def viscosity(input):
    fq = interp1d(input_KV, viscosity_value, kind='quadratic')
    return fq(input) * 10e-6

def conductivity(input):
    fq = interp1d(input_ka, conductivity_value, kind='quadratic')
    return fq(input) * 10e-3

def calculation(Param, sig, sig_p, sig_t):
    new_col1 = np.empty(len(sig_p))
    new_col1[0] = sig_t[0]
    my_goal = np.empty(len(sig_p))
    my_goal[0] = sig_t[0]
    calc1 = COEFFICIENT * Param.RATIO * sig_p * sig / 2
    for n in range(1, len(sig_p)):
        calc2 = EMISSIVITY * Param.A * (new_col1[n-1]**4 - sig_t[n]**4)
        Ka = conductivity(sig_t[n])
        if sig[n] == 0:
            h = Param.O_COEF
        else:
            KV = viscosity(sig_t[n])
            if sig[n] < DYNAMIC_SPEED:
                h = (0.7 * (sig[n] / KV)**0.4) * Ka + Param.O_COEF
            else:
                h = (0.04 * sig[n] / KV**0.8) * Ka + Param.L_COEF
        calc3 = h * Param.A * (new_col1[n-1] - sig_t[n])
        calc4 = Ka * Param.A * (new_col1[n-1] - sig_t[n]) / Param.T
        a1 = (calc1[n] - (calc2 + calc3 + calc4)) / (SPEC * Param.M)
        new_col1[n] = new_col1[n-1] + a1 * TIME_STAMP

        if sig_p[n] == 0:
            val1 = ABSORBTIVITY * Param.FACTOR * calc2
        elif (sig_p[n] > 0) & (sig_p[n] <= 20):
            val1 = (ABSORBTIVITY * Param.FACTOR * calc2 * (20 - sig_p[n]) / 20
                    + ((1 - COEFFICIENT) * calc1[n] / 4) * sig_p[n] / 20)
        else:
            val1 = (1 - COEFFICIENT) * calc1[n] / 4

        if sig[n] == 0:
            val2 = Param.T_CONST
        else:
            h_bar = Param.P_CONST * (sig[n] * Param.DYNAMIC)**0.8
            val2 = h_bar * Param.S * (my_goal[n-1] - sig_t[n])
        a2 = (val1 - val2) / (SPECIFIC * Param.W)
        my_goal[n] = my_goal[n-1] + a2 * TIME_STAMP
        if my_goal[n] < sig_t[0]:
            my_goal[n] = sig_t[0]
    return my_goal

df = pd.read_csv('data.csv', index_col=0)
df['newcol1'] = calculation(Param1, df['sig1'].values, df['sig_p'].values, df['sig_t'].values)
df['newcol2'] = calculation(Param1, df['sig2'].values, df['sig_p'].values, df['sig_t'].values)
df['newcol3'] = calculation(Param2, df['sig3'].values, df['sig_p'].values, df['sig_t'].values)
df['newcol4'] = calculation(Param2, df['sig4'].values, df['sig_p'].values, df['sig_t'].values)
I now need to apply this function to several million rows and it's impossibly slow, so I'm trying to figure out the best way to speed it up. I've heard that Cython can increase the speed of functions, but I have no experience with it (and I'm new to both pandas and Python).
My question is whether there is any way to enhance or speed up this computation.
I run this Python code on AWS (a SageMaker notebook instance with Jupyter) and my computer's OS is Windows.
Iteration is easy to code but slow on a dataframe. Here is a hint for your solution: you need to vectorize the code inside the loop while n < len(sig_p):.
For example, your previous code:
def fun(Param, sig_p, sig, sig_t):
    tempvalue = np.empty(sig_p.shape)
    tempvalue[0] = sig_t[0]
    newcol = np.empty(sig_p.shape)
    newcol[0] = sig_t[0]
    n = 1
    while n < len(sig_p):
        # calc1 = fun1()
        calc1 = Param.COEF * (sig_p[n]) * Param.NO * Param.EFF  # fun1()
        # calc2 = fun2()
        if sig[n] > Param.THRESHOLD:
            calc2 = 0
        else:
            calc2 = Param.EMISSIVITY * Param.CONSTANT * (tempvalue[n-1]**4 - sig_t[n]**4)
        # calc3
        # calc4
        # ......

df['newcol1'] = fun(param1, df['sig_p'].values, df['sig1'].values, df['sig_t'].values)
To eliminate the while loop, fun1() and fun2() can be rewritten like this:
def fun(Param, df, sigTag):
    # df['calc1'] = vectorized fun1()
    df['calc1'] = Param.COEF * df['sig_p'] * Param.NO * Param.EFF
    # df['calc2'] = vectorized fun2()
    df['calc2'] = Param.EMISSIVITY * Param.CONSTANT * (df['sig_t'].shift(1)**4 - df['sig_t']**4)
    df.loc[df[sigTag] > Param.THRESHOLD, 'calc2'] = 0
    # df['calc3'] = vectorized fun3()
    # df['calc4'] = vectorized fun4()
    # ......

df['newcol1'] = fun(param1, df, 'sig1')
You might also want to pass the whole dataframe into fun() rather than separate ndarrays.
This approach should greatly improve performance. You might want to do some research on how to vectorize the calculation.
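One concrete way to apply that idea to the calculation() function from the question is to hoist everything that does not depend on the previous row out of the loop. The recurrences for new_col1 and my_goal cannot themselves be vectorized, but the interp1d objects can be built once, and the per-row quantities that depend only on sig, sig_p and sig_t can be precomputed as whole arrays. This is only a sketch of that idea (calculation_fast is a hypothetical name; it assumes the constants and lookup tables from test.py are in scope and is not a tested drop-in replacement):

import numpy as np
from scipy.interpolate import interp1d

# build the interpolators once instead of once per row
viscosity_fq = interp1d(input_KV, viscosity_value, kind='quadratic')
conductivity_fq = interp1d(input_ka, conductivity_value, kind='quadratic')

def calculation_fast(Param, sig, sig_p, sig_t):
    n_rows = len(sig_p)
    # vectorized pre-computation of everything that does not depend on
    # the previous row's result
    calc1 = COEFFICIENT * Param.RATIO * sig_p * sig / 2
    Ka = conductivity_fq(sig_t) * 10e-3   # one call for the whole column
    KV = viscosity_fq(sig_t) * 10e-6
    h = np.where(sig == 0, Param.O_COEF,
                 np.where(sig < DYNAMIC_SPEED,
                          0.7 * (sig / KV)**0.4 * Ka + Param.O_COEF,
                          0.04 * sig / KV**0.8 * Ka + Param.L_COEF))
    h_bar = Param.P_CONST * (sig * Param.DYNAMIC)**0.8

    new_col1 = np.empty(n_rows)
    my_goal = np.empty(n_rows)
    new_col1[0] = my_goal[0] = sig_t[0]

    # only the recurrence itself stays in the Python loop
    for i in range(1, n_rows):
        calc2 = EMISSIVITY * Param.A * (new_col1[i-1]**4 - sig_t[i]**4)
        calc3 = h[i] * Param.A * (new_col1[i-1] - sig_t[i])
        calc4 = Ka[i] * Param.A * (new_col1[i-1] - sig_t[i]) / Param.T
        a1 = (calc1[i] - (calc2 + calc3 + calc4)) / (SPEC * Param.M)
        new_col1[i] = new_col1[i-1] + a1 * TIME_STAMP

        if sig_p[i] == 0:
            val1 = ABSORBTIVITY * Param.FACTOR * calc2
        elif 0 < sig_p[i] <= 20:
            val1 = (ABSORBTIVITY * Param.FACTOR * calc2 * (20 - sig_p[i]) / 20
                    + (1 - COEFFICIENT) * calc1[i] / 4 * sig_p[i] / 20)
        else:
            val1 = (1 - COEFFICIENT) * calc1[i] / 4

        val2 = Param.T_CONST if sig[i] == 0 else h_bar[i] * Param.S * (my_goal[i-1] - sig_t[i])
        a2 = (val1 - val2) / (SPECIFIC * Param.W)
        my_goal[i] = max(my_goal[i-1] + a2 * TIME_STAMP, sig_t[0])
    return my_goal

Since the remaining loop no longer calls scipy, it is also a good candidate for numba's @njit decorator (or the Cython route mentioned in the question) if precomputing the arrays alone is not fast enough.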
Related
I want to perform the desired computations based on my 'x' and 'y' coordinate values, as in the table below (Table 1):
TIMESTEP nparticles v_x v_y radius area_sum vx_a1 vy_a1 phi_1
0 1 0.0 0.0 0.490244 0.7550478008000959 1.90579 -1.83605 0.36630
100 1 0.369944 0.196252 0.490244 0.7550478008000959
200 1 -0.110178 -0.233131 0.490244 0.7550478008000959
...
...
97400 1 -1.03617 -7.24768 0.461981 0.6704989496863082
97500 1 -1.30016 -7.25768 0.461981 0.6704989496863082
...
...
For this I am using the following code on the dataframe generated above:
bindistance = 0.25
orfl = -4.0
orfr = 4.0
bin_xc = np.arange(orfl, orfr, bindistance)
nbins = len(bin_xc)
binx = 0.25
xo_min = -4.0
xo_max = 4.0
xb1_c = xo_min
xb1_max = xb1_c + (binx * 2)
xb1_min = xb1_c - (binx * 2)
yb_min = -0.5
yb_max = 0.5
yb_c = 0
x_particle1 = df.loc[(df['x'] < xb1_max) & (df['x'] > xb1_min)]
xy_particle1 = x_particle1.loc[(x_particle1['y'] < yb_max) & (x_particle1['y'] > yb_min)]
output1 = xy_particle1.groupby("TIMESTEP").agg(nparticles = ("id", "count"), v_x=("vx", "sum"), v_y=("vy", "sum"), radius = ("radius", "sum"), area_sum = ("Area", "sum"))
nsum1 = output1["nparticles"].sum()
vxsum1 = output1["v_x"].sum()
vysum1 = output1["v_y"].sum()
v_a1 = vxsum1 / nsum1
vy_a1 = vysum1 / nsum1
phi_1 = output1["area_sum"].sum() / 1001
But I have a very large number of such dataframes (the first one is shown above), each based on different 'x' and 'y' coordinate conditions, so manually writing this code 50 or more times is not feasible. How can I do it with a loop or in some other way? Please help.
This is my input dataset (df):
TIMESTEP id radius x y vx vy Area
0 42 0.490244 -3.85683 0.489375 0.0 0.0 0.7550478008000959
0 245 0.479994 -2.88838 0.479446 0.0 0.0 0.7238048519265009
0 344 0.463757 -1.94613 0.463363 0.0 0.0 0.6756640757454175
0 313 0.503268 -0.981364 0.501991 0.0 0.0 0.7956984398459999
...
...
100000 1051 0.542993 0.887743 1.71649 -0.309668 -5.83282 0.9262715700848821
100000 504 0.540275 2.87158 1.94939 -5.76545 -2.30889 0.9170217083878441
100000 589 0.450005 3.86868 1.89373 -4.49676 -2.63977 0.636186649597414
...
...
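A rough sketch of one way to avoid writing the block 50 times: wrap the per-bin selection in a function and loop over the bin centres in bin_xc. The function name bin_stats and the result layout are only illustrative (they are not from the original code), and it assumes the same bin half-width (binx * 2) and y-band used above:

import numpy as np
import pandas as pd

def bin_stats(df, xc, half_width, yb_min=-0.5, yb_max=0.5):
    # select the particles inside one x-bin and the fixed y-band
    sel = df.loc[(df['x'] > xc - half_width) & (df['x'] < xc + half_width)
                 & (df['y'] > yb_min) & (df['y'] < yb_max)]
    out = sel.groupby("TIMESTEP").agg(nparticles=("id", "count"), v_x=("vx", "sum"),
                                      v_y=("vy", "sum"), radius=("radius", "sum"),
                                      area_sum=("Area", "sum"))
    nsum = out["nparticles"].sum()
    return {"xc": xc,
            "vx_a": out["v_x"].sum() / nsum if nsum else np.nan,
            "vy_a": out["v_y"].sum() / nsum if nsum else np.nan,
            "phi": out["area_sum"].sum() / 1001}

# one row of summary statistics per bin centre
results = pd.DataFrame([bin_stats(df, xc, binx * 2) for xc in bin_xc])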
I would like to simulate individual changes in growth and mortality for a variable number of days. My dataframe is formatted as follows...
import pandas as pd
data = {'unique_id': ['2', '4', '5', '13'],
'length': ['27.7', '30.2', '25.4', '29.1'],
'no_fish': ['3195', '1894', '8', '2774'],
'days_left': ['253', '253', '254', '256'],
'growth': ['0.3898', '0.3414', '0.4080', '0.3839']
}
df = pd.DataFrame(data)
print(df)
unique_id length no_fish days_left growth
0 2 27.7 3195 253 0.3898
1 4 30.2 1894 253 0.3414
2 5 25.4 8 254 0.4080
3 13 29.1 2774 256 0.3839
Ideally, I would like the initial length (i.e., length) to increase by the daily growth rate (i.e., growth) for each of the days remaining in the year (i.e., days_left).
df['final'] = df['length'] + (df['days_left'] * df['growth'])
However, I would also like to update the number of fish that each individual represents (i.e., no_fish) on a daily basis using a size-specific equation. I'm fairly new to python so I initially thought to use a for-loop (I'm not sure if there is another, more efficient way). My code is as follows:
# keep track of run time - START
start_time = time.perf_counter()
df['z'] = 0.0
for indx in range(len(df)):
    count = 1
    while count <= int(df.days_to_forecast[indx]):
        # (1) update individual length
        df.lgth[indx] = df.lgth[indx] + df.linearGR[indx]
        # (2) estimate daily size-specific mortality
        if df.lgth[indx] > 50.0:
            df.z[indx] = 0.01
        else:
            if df.lgth[indx] <= 50.0:
                df.z[indx] = 0.052857 - ((0.03 / 35) * df.lgth[indx])
            elif df.lgth[indx] < 15.0:
                df.z[indx] = 0.728 * math.exp(-0.1892 * df.lgth[indx])
        df['no_fish'].round(decimals=0)
        if df.no_fish[indx] < 1.0:
            df.no_fish[indx] = 0.0
        elif df.no_fish[indx] >= 1.0:
            df.no_fish[indx] = df.no_fish[indx] * math.exp(-(df.z[indx]))
        # (3) reduce no. of days left in forecast by 1
        count = count + 1
# keep track of run time - END
total_elapsed_time = round(time.perf_counter() - start_time, 2)
print("Forecast iteration completed in {} seconds".format(total_elapsed_time))
The above code now works correctly, but it is still far too inefficient to run for 40,000 individuals, each for 200+ days.
I would really appreciate any advice on how to modify it to make it more pythonic.
Thanks
Another option that was suggested to me is to use pandas' DataFrame.apply. This dramatically reduced the overall run time and could be useful to someone else in the future.
### === RUN SIMULATION === ###
start_time = time.perf_counter() # keep track of run time -- START
#-------------------------------------------------------------------------#
def function_to_apply(df):
    df['z_instantMort'] = ''
    for indx in range(int(df['days_left'])):
        # (1) update individual length
        df['length'] = df['length'] + df['growth']
        # (2) estimate daily size-specific mortality
        if df['length'] > 50.0:
            df['z_instantMort'] = 0.01
        else:
            if df['length'] <= 50.0:
                df['z_instantMort'] = 0.052857 - ((0.03 / 35) * df['length'])
            elif df['length'] < 15.0:
                df['z_instantMort'] = 0.728 * np.exp(-0.1892 * df['length'])
        whole_fish = round(df['no_fish'], 0)
        if whole_fish < 1.0:
            df['no_fish'] = 0.0
        elif whole_fish >= 1.0:
            df['no_fish'] = df['no_fish'] * np.exp(-(df['z_instantMort']))
    return df
#-------------------------------------------------------------------------#
sim_results = df.apply(function_to_apply, axis=1)
total_elapsed_time = round(time.perf_counter() - start_time, 2) # END
print("Forecast iteration completed in {} seconds".format(total_elapsed_time))
print(sim_results)
### ====================== ###
output being...
Forecast iteration completed in 0.05 seconds
unique_id length no_fish days_left growth z_instantMort
0 2.0 126.3194 148.729190 253.0 0.3898 0.01
1 4.0 116.5742 93.018465 253.0 0.3414 0.01
2 5.0 129.0320 0.000000 254.0 0.4080 0.01
3 13.0 127.3784 132.864757 256.0 0.3839 0.01
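For comparison, the same per-row simulation can also be written against plain Python/NumPy scalars rather than a pandas row, which avoids building a Series for every element. This is only a sketch mirroring the logic of the apply version above (simulate_row is a hypothetical name, and it assumes the columns have already been converted to numeric types):

import numpy as np
import pandas as pd

def simulate_row(length, no_fish, days_left, growth):
    z = 0.0
    for _ in range(int(days_left)):
        # (1) update individual length
        length += growth
        # (2) estimate daily size-specific mortality
        if length > 50.0:
            z = 0.01
        else:
            z = 0.052857 - (0.03 / 35) * length
        if round(no_fish) < 1.0:
            no_fish = 0.0
        else:
            no_fish *= np.exp(-z)
    return length, no_fish, z

out = [simulate_row(l, n, d, g)
       for l, n, d, g in zip(df['length'], df['no_fish'], df['days_left'], df['growth'])]
res = pd.DataFrame(out, columns=['length', 'no_fish', 'z_instantMort'], index=df.index)
sim_results = df[['unique_id', 'days_left', 'growth']].join(res)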
As I said in my comment, a preferable alternative to for loops in this setting is using vector operations. For instance, running your code:
import pandas as pd
import time
import math
import numpy as np
data = {'unique_id': [2, 4, 5, 13],
'length': [27.7, 30.2, 25.4, 29.1],
'no_fish': [3195, 1894, 8, 2774],
'days_left': [253, 253, 254, 256],
'growth': [0.3898, 0.3414, 0.4080, 0.3839]
}
df = pd.DataFrame(data)
print(df)
# keep track of run time - START
start_time = time.perf_counter()
df['z'] = 0.0
for indx in range(len(df)):
    count = 1
    while count <= int(df.days_left[indx]):
        # (1) update individual length
        df.length[indx] = df.length[indx] + df.growth[indx]
        # (2) estimate daily size-specific mortality
        if df.length[indx] > 50.0:
            df.z[indx] = 0.01
        else:
            if df.length[indx] <= 50.0:
                df.z[indx] = 0.052857 - ((0.03 / 35) * df.length[indx])
            elif df.length[indx] < 15.0:
                df.z[indx] = 0.728 * math.exp(-0.1892 * df.length[indx])
        df['no_fish'].round(decimals=0)
        if df.no_fish[indx] < 1.0:
            df.no_fish[indx] = 0.0
        elif df.no_fish[indx] >= 1.0:
            df.no_fish[indx] = df.no_fish[indx] * math.exp(-(df.z[indx]))
        # (3) reduce no. of days left in forecast by 1
        count = count + 1
# keep track of run time - END
total_elapsed_time = round(time.perf_counter() - start_time, 2)
print("Forecast iteration completed in {} seconds".format(total_elapsed_time))
print(df)
with output:
unique_id length no_fish days_left growth
0 2 27.7 3195 253 0.3898
1 4 30.2 1894 253 0.3414
2 5 25.4 8 254 0.4080
3 13 29.1 2774 256 0.3839
Forecast iteration completed in 31.75 seconds
unique_id length no_fish days_left growth z
0 2 126.3194 148.729190 253 0.3898 0.01
1 4 116.5742 93.018465 253 0.3414 0.01
2 5 129.0320 0.000000 254 0.4080 0.01
3 13 127.3784 132.864757 256 0.3839 0.01
Now with vector operations, you could do something like:
# keep track of run time - START
start_time = time.perf_counter()
df['z'] = 0.0
for day in range(1, df.days_left.max() + 1):
    update = day <= df['days_left']
    # (1) update individual length
    df.loc[update, 'length'] = df.loc[update, 'length'] + df.loc[update, 'growth']
    # (2) estimate daily size-specific mortality
    df.loc[update, 'z'] = np.where(df.loc[update, 'length'] > 50.0, 0.01,
                                   0.052857 - ((0.03 / 35) * df.loc[update, 'length']))
    df.loc[update, 'z'] = np.where(df.loc[update, 'length'] < 15.0,
                                   0.728 * np.exp(-0.1892 * df.loc[update, 'length']),
                                   df.loc[update, 'z'])
    df.loc[update, 'no_fish'].round(decimals=0)
    df.loc[update, 'no_fish'] = np.where(df.loc[update, 'no_fish'] < 1.0, 0.0,
                                         df.loc[update, 'no_fish'] * np.exp(-df.loc[update, 'z']))
# keep track of run time - END
total_elapsed_time = round(time.perf_counter() - start_time, 2)
print("Forecast iteration completed in {} seconds".format(total_elapsed_time))
print(df)
with output
Forecast iteration completed in 1.32 seconds
unique_id length no_fish days_left growth z
0 2 126.3194 148.729190 253 0.3898 0.0
1 4 116.5742 93.018465 253 0.3414 0.0
2 5 129.0320 0.000000 254 0.4080 0.0
3 13 127.3784 132.864757 256 0.3839 0.0
I'm currently using a specific FEM software. The post-processing tool is quite outdated, and it can run only on a dedicated machine. I want to visualize some of the results on my own laptop (for better presentation), using the result files the software produces. I'm using the Pandas library with Python.
I was able to get to the point where I have two different DataFrames, one with the element ID and the nodes that construct it, and the second with nodes ID, and x,y coordinates -
elementDF - includes {index, element ID, node1, node2, node3} # elements have 3 nodes
coordsDF - includes {index, node ID, x, y}
and I was able to combine the two into a single DataFrame -
df - includes {index, element ID, x1, y1, x2, y2, x3, y3} # where x1 and y1 are the
coordinates of node1, etc
I will later use this DataFrame to build polygons and visualize the mesh.
The thing is, I believe I used a very costly loop to search for each node by its ID, extract the x & y coordinates, and then combine everything. I know this because the dedicated post-processing program does that in a few seconds (for a large mesh - 10,000 elements or more) and mine takes around 40~60 seconds for the same number of elements. I would like to know if there is a quicker and more efficient way to construct the final DataFrame.
Sample input DataFrames:
elementDF = pd.DataFrame({
'element': [1,2,3,4,5,6,7,8,9,10],
'node1': [2,33,33,32,183,183,183,185,185,36],
'node2': [34,34,183,183,34,35,185,35,36,37],
'node3': [33,183,32,184,35,185,186,36,187,187]
})
coordsDF = pd.DataFrame({
'node': [2,32,33,34,35,36,37,183,184,185,186,187],
'x': [-1, 1, 1, -1, -1.1, 1.1, 1.1, -1.1, -1.1, 1.1, 2, 2.2],
'y': [0,0,2,2,-0.2,-0.2,0,0,2,2, 4, 4.4]
})
Sample code:
import pandas as pd

def extractXY(nodeNumber, df):
    # extract x,y data from node location
    nodeData = df.loc[df['node'] == nodeNumber]
    x = nodeData.x
    y = nodeData.y
    return x, y

# main #
df = pd.DataFrame(columns=['x1', 'y1', 'x2', 'y2', 'x3', 'y3'])
for i in range(len(elementDF)):
    nodeNumber1 = elementDF.loc[i].node1
    x1, y1 = extractXY(nodeNumber1, coordsDF)
    nodeNumber2 = elementDF.loc[i].node2
    x2, y2 = extractXY(nodeNumber2, coordsDF)
    nodeNumber3 = elementDF.loc[i].node3
    x3, y3 = extractXY(nodeNumber3, coordsDF)
    df = df.append({'x1': float(x1), 'y1': float(y1),
                    'x2': float(x2), 'y2': float(y2),
                    'x3': float(x3), 'y3': float(y3)}, ignore_index=True)

df = pd.concat([elementDF['element'], df], axis=1)
Let's try this:
import pandas as pd
elementDF = pd.DataFrame({
'element': [1,2,3,4,5,6,7,8,9,10],
'node1': [2,33,33,32,183,183,183,185,185,36],
'node2': [34,34,183,183,34,35,185,35,36,37],
'node3': [33,183,32,184,35,185,186,36,187,187]
})
coordsDF = pd.DataFrame({
'node': [2,32,33,34,35,36,37,183,184,185,186,187],
'x': [-1, 1, 1, -1, -1.1, 1.1, 1.1, -1.1, -1.1, 1.1, 2, 2.2],
'y': [0,0,2,2,-0.2,-0.2,0,0,2,2, 4, 4.4]
})
mapx = coordsDF.set_index('node')['x']
mapy = coordsDF.set_index('node')['y']
df = pd.concat([
    elementDF.set_index('element').replace(mapx).rename(columns=lambda x: x.replace('node', 'x')),
    elementDF.set_index('element').replace(mapy).rename(columns=lambda y: y.replace('node', 'y')),
], axis=1)
df
Output:
x1 x2 x3 y1 y2 y3
element
1 -1.0 -1.0 1.0 0.0 2.0 2.0
2 1.0 -1.0 -1.1 2.0 2.0 0.0
3 1.0 -1.1 1.0 2.0 0.0 0.0
4 1.0 -1.1 -1.1 0.0 0.0 2.0
5 -1.1 -1.0 -1.1 0.0 2.0 -0.2
6 -1.1 -1.1 1.1 0.0 -0.2 2.0
7 -1.1 1.1 2.0 0.0 2.0 4.0
8 1.1 -1.1 1.1 2.0 -0.2 -0.2
9 1.1 1.1 2.2 2.0 -0.2 4.4
10 1.1 1.1 2.2 -0.2 0.0 4.4
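As a variant on the same idea (a sketch only, not part of the answer above), Series.map can be used in place of replace; map translates each node ID through the lookup Series by exact index match, which is typically at least as fast on a large mesh:

nodes = elementDF.set_index('element')

df = pd.concat([
    nodes.apply(lambda col: col.map(mapx)).rename(columns=lambda c: c.replace('node', 'x')),
    nodes.apply(lambda col: col.map(mapy)).rename(columns=lambda c: c.replace('node', 'y')),
], axis=1)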
I am calculating a rolling mean and the bands a given number of rolling standard deviations above and below it (Bollinger Bands; the example here is very simplified) in a pandas dataframe like this:
import pandas as pd
import numpy as np
no_of_std = 3
window = 20
df = pd.DataFrame({'A': [34, 34, 34, 33, 32, 34, 35.0, 21, 22, 25, 23, 21, 39, 26, 31, 34, 38, 26, 21, 39, 31]})
rolling_mean = df['A'].rolling(window).mean()
rolling_std = df['A'].rolling(window).std(ddof=0)
df['M'] = rolling_mean
df['BBL'] = rolling_mean - (rolling_std * no_of_std)
df['BBH'] = rolling_mean + (rolling_std * no_of_std)
print (df)
The result looks like this:
A M BBL BBH
0 34.0 NaN NaN NaN
1 34.0 NaN NaN NaN
2 34.0 NaN NaN NaN
3 33.0 NaN NaN NaN
4 32.0 NaN NaN NaN
5 34.0 NaN NaN NaN
6 35.0 NaN NaN NaN
7 21.0 NaN NaN NaN
8 22.0 NaN NaN NaN
9 25.0 NaN NaN NaN
10 23.0 NaN NaN NaN
11 21.0 NaN NaN NaN
12 39.0 NaN NaN NaN
13 26.0 NaN NaN NaN
14 31.0 NaN NaN NaN
15 34.0 NaN NaN NaN
16 38.0 NaN NaN NaN
17 26.0 NaN NaN NaN
18 21.0 NaN NaN NaN
19 39.0 30.10 11.633544 48.566456
20 31.0 29.95 11.665375 48.234625
Now I want to calculate in the other direction: which value the next entry in column 'A' would need to have in order to hit exactly the 3rd standard deviation of the rolling mean.
In other words: which value does A need to take in the next row so that it is exactly equal to the value in BBH or BBL?
I can do this by recursive approximation, but it costs a lot of performance and I think there must be a better way. Here is an example of that solution, which I think is too slow:
import pandas as pd
odf = pd.DataFrame({'A': [34, 34, 34, 33, 32, 34, 35.0, 21, 22, 25, 23, 21, 39, 26, 31, 34, 38, 26, 21, 39, 31]})
def get_last_bbh_bbl(idf):
    xdf = idf.copy()
    no_of_std = 3
    window = 20
    rolling_mean = xdf['A'].rolling(window).mean()
    rolling_std = xdf['A'].rolling(window).std()
    xdf['M'] = rolling_mean
    xdf['BBL'] = rolling_mean - (rolling_std * no_of_std)
    xdf['BBH'] = rolling_mean + (rolling_std * no_of_std)
    bbh = xdf.loc[len(xdf) - 1, 'BBH']
    bbl = xdf.loc[len(xdf) - 1, 'BBL']
    return bbh, bbl

def search_matching_value(idf, low, high, search_for):
    xdf = idf.copy()
    if abs(high - low) < 0.000001:
        return high
    middle = low + ((high - low) / 2)
    xdf = xdf.append({'A': middle}, ignore_index=True)
    bbh, bbl = get_last_bbh_bbl(xdf)
    if search_for == 'bbh':
        if bbh < middle:
            result = search_matching_value(idf, low, middle, search_for)
        elif bbh > middle:
            result = search_matching_value(idf, middle, high, search_for)
        else:
            return middle
    elif search_for == 'bbl':
        if bbl > middle:
            result = search_matching_value(idf, middle, high, search_for)
        elif bbl < middle:
            result = search_matching_value(idf, low, middle, search_for)
        else:
            return middle
    return result
actual_bbh, actual_bbl = get_last_bbh_bbl(odf)
last_value = odf.loc[len(odf) - 1, 'A']
print('last_value: {}, actual bbh: {}, actual bbl: {}'.format(last_value, actual_bbh, actual_bbl))
low = last_value
high = actual_bbh * 10
next_value_that_hits_bbh = search_matching_value(odf, low, high, 'bbh')
print ('next_value_that_hits_bbh: {}'.format(next_value_that_hits_bbh))
low=0
high=last_value
next_value_that_hits_bbl = search_matching_value(odf, low, high, 'bbl')
print ('next_value_that_hits_bbl: {}'.format(next_value_that_hits_bbl))
the result looks like this:
last_value: 31.0, actual bbh: 48.709629106422284, actual bbl: 11.190370893577711
next_value_that_hits_bbh: 57.298733206475276
next_value_that_hits_bbl: 2.174952656030655
Here is one solution to calculate the next value with a faster algorithm: Newton's method (both the classic version and scipy's optimized one) is faster than dichotomy, and this solution doesn't rebuild a dataframe to recalculate the values; it uses the functions from the statistics library directly.
Some info on scipy.optimize.newton:
from scipy import misc
import pandas as pd
import statistics
from scipy.optimize import newton
#scipy.optimize if you want to test the newton optimized function
def get_last_bbh_bbl(idf):
    xdf = idf.copy()
    rolling_mean = xdf['A'].rolling(window).mean()
    rolling_std = xdf['A'].rolling(window).std()
    xdf['M'] = rolling_mean
    xdf['BBL'] = rolling_mean - (rolling_std * no_of_std)
    xdf['BBH'] = rolling_mean + (rolling_std * no_of_std)
    bbh = xdf.loc[len(xdf) - 1, 'BBH']
    bbl = xdf.loc[len(xdf) - 1, 'BBL']
    lastvalue = xdf.loc[len(xdf) - 1, 'A']
    return lastvalue, bbh, bbl

# classic newton
def NewtonsMethod(f, x, tolerance=0.00000001):
    while True:
        x1 = x - f(x) / misc.derivative(f, x)
        t = abs(x1 - x)
        if t < tolerance:
            break
        x = x1
    return x

# to calculate the result of function bbl(x) - x (we want 0!)
def low(x):
    l = lastlistofvalue[:-1]
    l.append(x)
    avg = statistics.mean(l)
    std = statistics.stdev(l, avg)
    return avg - std * no_of_std - x

# to calculate the result of function bbh(x) - x (we want 0!)
def high(x):
    l = lastlistofvalue[:-1]
    l.append(x)
    avg = statistics.mean(l)
    std = statistics.stdev(l, avg)
    return avg + std * no_of_std - x
odf = pd.DataFrame({'A': [34, 34, 34, 33, 32, 34, 35.0, 21, 22, 25, 23, 21, 39, 26, 31, 34, 38, 26, 21, 39, 31]})
no_of_std = 3
window = 20
lastlistofvalue = odf['A'].shift(0).to_list()[::-1][:window]
"""" Newton classic method """
x = odf.loc[len(odf) - 1, 'A']
x0 = NewtonsMethod(high, x)
print(f'value to hit bbh: {x0}')
odf = pd.DataFrame({'A': [34, 34, 34, 33, 32, 34, 35.0, 21, 22, 25, 23, 21, 39, 26, 31, 34, 38, 26, 21, 39, 31, x0]})
lastvalue, new_bbh, new_bbl = get_last_bbh_bbl(odf)
print(f'value to hit bbh: {lastvalue} -> check new bbh: {new_bbh}')
x0 = NewtonsMethod(low, x)
print(f'value to hit bbl: {x0}')
odf = pd.DataFrame({'A': [34, 34, 34, 33, 32, 34, 35.0, 21, 22, 25, 23, 21, 39, 26, 31, 34, 38, 26, 21, 39, 31, x0]})
lastvalue, new_bbh, new_bbl = get_last_bbh_bbl(odf)
print(f'value to hit bbl: {lastvalue} -> check new bbl: {new_bbl}')
output:
value to hit bbh: 57.298732375228624
value to hit bbh: 57.298732375228624 -> check new bbh: 57.29873237527272
value to hit bbl: 2.1749518354059636
value to hit bbl: 2.1749518354059636 -> check new bbl: 2.1749518353102992
You can compare with the optimized Newton version like this:
""" Newton optimized method """
x = odf.loc[len(odf) - 1, 'A']
x0 = newton(high, x, fprime=None, args=(), tol=1.00e-08, maxiter=50, fprime2=None)
print(f'Newton opt value to hit bbh: {x0}')
x0 = newton(low, x, fprime=None, args=(), tol=1.48e-08, maxiter=50, fprime2=None)
print(f'Newton value to hit bbl: {x0}')
output:
Newton opt value to hit bbh: 57.29873237532118
Newton value to hit bbl: 2.1749518352051225
With the optimized Newton you can also play with the maximum number of iterations, and the optimized version is faster than the classic one. Measured time for each calculation:
0.002 sec for optimized
0.005 sec for classic
Remarks:
If you use rolling(window).std() you are using the sample standard deviation, so you have to use std = statistics.stdev(l, avg), which divides by N-1 items.
If you use rolling(window).std(ddof=0) you are using the population standard deviation, so you have to use std = statistics.pstdev(l, avg), which divides by N items.
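For example, a quick check of that correspondence (not from the original answer) on a small series:

import statistics
import pandas as pd

s = pd.Series([34.0, 34, 33, 32, 39])
vals = s.to_list()

# sample standard deviation: divide by N-1
assert abs(s.std() - statistics.stdev(vals)) < 1e-12
# population standard deviation: divide by N
assert abs(s.std(ddof=0) - statistics.pstdev(vals)) < 1e-12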
How can I create a new column in a pandas.DataFrame with a value returned from a scipy function? The scipy.optimize function calls another function to determine a value. I'm able to print the returned values, validating the functionality, but I'm unable to store them in a new pandas column.
# import packages
import pandas as pd
from math import sqrt, log, exp
from scipy.stats import norm
from scipy import optimize
# define variables
tradingMinutesDay = 390.0
tradingMinutesAnnum = 98280.0
# create pandas.DataFrame
df = pd.DataFrame.from_dict({'CP': [1, -1, 1, -1],
'M': [1.705, 1.305, 2.45, 1.995],
'RF': [0.008671, 0.008671, 0.009290, 0.009290],
'K': [60.0, 60.0, 60.0, 60.0],
'T': [33.0, 33.0, 53.0, 53.0],
'S': [60.4, 60.4, 60.4, 60.4]})
# def function
def find_sigma2(sigma, mark, cp, S, K, dte, rf):
    T = (dte * tradingMinutesDay) / tradingMinutesAnnum
    q = 0.0
    log_SK = log(S / K)
    sqrt_T = sqrt(T)
    drf = exp(-rf * T)
    dq = exp(-q * T)
    d1 = (log_SK + T * (rf - q + sigma ** 2 / 2)) / (sigma * sqrt_T)
    d2 = d1 - sigma * sqrt_T
    cdf_d1 = norm.cdf(cp * d1)
    cdf_d2 = norm.cdf(cp * d2)
    return cp * ((S * dq * cdf_d1) - (K * drf * cdf_d2)) - mark
I'm able to run the functions and print the values:
# Can print accurate values
for r in df.itertuples():
    print(optimize.brentq(find_sigma2, .0001, 10, args=(r.M, r.CP, r.S, r.K, r.T, r.RF), xtol=1.0e-4))
0.16798850071790686
0.17589393607434
0.19833696082012875
0.2040142964775614
I'm unable to store the values using the methods below.
# TypeError: cannot convert the series to <class 'float'>
df['IV'] = df.apply(optimize.brentq(find_sigma2, .0001, 10, args=(df.M, df.CP, df.S, df.K, df.T, df.RF), xtol=1.0e-4), axis=1)

# AttributeError: 'Pandas' object has no attribute 'IV'
for r in df.itertuples():
    r.IV = optimize.brentq(find_sigma2, .0001, 10, args=(r.M, r.CP, r.S, r.K, r.T, r.RF), xtol=1.0e-4)

# AttributeError: can't set attribute
df['IV'] = 0
for r in df.itertuples():
    r.IV = optimize.brentq(find_sigma2, .0001, 10, args=(r.M, r.CP, r.S, r.K, r.T, r.RF), xtol=1.0e-4)

# TypeError: cannot convert the series to <class 'float'>
for i, r in df.iterrows():
    r.IV = optimize.brentq(find_sigma2, .0001, 10, args=(r.M, r.CP, r.S, r.K, r.T, r.RF), xtol=1.0e-4)

# TypeError: cannot convert the series to <class 'float'>
for i, r in df.iterrows():
    df.set_value(i, r, (optimize.brentq(find_sigma2, .0001, 10, args=(r.M, r.CP, r.S, r.K, r.T, r.RF), xtol=1.0e-4)))
The expected output:
CP K M RF S T IV
0 1 60.0 1.705 0.008671 60.4 33.0 0.167989
1 -1 60.0 1.305 0.008671 60.4 33.0 0.175894
2 1 60.0 2.450 0.009290 60.4 53.0 0.198337
3 -1 60.0 1.995 0.009290 60.4 53.0 0.204014
Any ideas?
Option 1
brute force
iv = [
    optimize.brentq(
        find_sigma2, .0001, 10, args=(r.M, r.CP, r.S, r.K, r.T, r.RF), xtol=1.0e-4
    ) for r in df.itertuples()
]

df.assign(IV=iv)
CP K M RF S T IV
0 1 60.0 1.705 0.008671 60.4 33.0 0.167989
1 -1 60.0 1.305 0.008671 60.4 33.0 0.175894
2 1 60.0 2.450 0.009290 60.4 53.0 0.198337
3 -1 60.0 1.995 0.009290 60.4 53.0 0.204014
Option 2
more brute force
for r in df.itertuples():
    # note: DataFrame.set_value was removed in pandas 1.0;
    # df.at[r.Index, 'IV'] = ... is the modern equivalent
    df.set_value(
        r.Index, 'IV',
        optimize.brentq(
            find_sigma2, .0001, 10, args=(r.M, r.CP, r.S, r.K, r.T, r.RF), xtol=1.0e-4
        )
    )
df
CP K M RF S T IV
0 1 60.0 1.705 0.008671 60.4 33.0 0.167989
1 -1 60.0 1.305 0.008671 60.4 33.0 0.175894
2 1 60.0 2.450 0.009290 60.4 53.0 0.198337
3 -1 60.0 1.995 0.009290 60.4 53.0 0.204014
The thing that is likely tripping you up is the column named T. Using .T will give you the transpose of the Series not the element named T. So something like this will work:
Code:
def run_brentq(r):
    return optimize.brentq(
        find_sigma2, .0001, 10,
        args=(r.M, r.CP, r.S, r.K, r['T'], r.RF),
        xtol=1.0e-4)
df['IV'] = df.apply(run_brentq, axis=1)
print(df)
Results:
CP K M RF S T IV
0 1 60.0 1.705 0.008671 60.4 33.0 0.167989
1 -1 60.0 1.305 0.008671 60.4 33.0 0.175894
2 1 60.0 2.450 0.009290 60.4 53.0 0.198337
3 -1 60.0 1.995 0.009290 60.4 53.0 0.204014
I used the answer presented above with the following change to accommodate records with bad data:
def run_brentq(r):
    try:
        return optimize.brentq(
            find_sigma2, .0001, 10,
            args=(r.M, r.CP, r.S, r.K, r['T'], r.RF),
            xtol=1.0e-4)
    except:
        return 0
df['IV'] = df.apply(run_brentq, axis=1)
print(df)
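A possible refinement of the same idea (not part of the original answer): brentq raises ValueError when f(.0001) and f(10) have the same sign, so catching only that exception keeps bad records at 0 without hiding unrelated errors:

def run_brentq(r):
    try:
        return optimize.brentq(
            find_sigma2, .0001, 10,
            args=(r.M, r.CP, r.S, r.K, r['T'], r.RF),
            xtol=1.0e-4)
    except ValueError:
        # no sign change on the bracket, e.g. a bad mark price
        return 0

df['IV'] = df.apply(run_brentq, axis=1)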