Optimizing performance of SciPy Minimize while using Concurrent.Futures? - python

I'm trying to figure out how to speed up a scipy.minimize function. minimize() is called thousands of times, so I run it in parallel using a ProcessPoolExecutor. bootstrap() is the parent function.
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from scipy.optimize import minimize
import numpy as np
import pandas as pd


def optimize_weights(forecasts, prices, instrument):
    guess = [1 / forecasts.shape[1]] * forecasts.shape[1]
    bounds = [(0.0, 1.0)] * forecasts.shape[1]
    cons = {'type': 'eq', 'fun': lambda x: 1 - sum(x)}

    def function(w, forecasts, prices, instrument):
        wf = (w * forecasts).mean(axis=1)
        wf = wf * 10 / wf.std()
        wf = wf.clip(-20, 20)
        l = accountCurve(wf, prices, slippage=instrument.slippage,
                         per_contract_cost=instrument.per_contract_cost)
        return -l.sharpe()

    result = minimize(function, guess, (forecasts, prices, instrument),
                      bounds=bounds, method='SLSQP', tol=0.0001,
                      constraints=cons, options={'disp': False, 'eps': 1e0})
    return result.x


def mp_optimize_weights(samples, prices, instrument):
    with ProcessPoolExecutor() as executor:
        return executor.map(partial(optimize_weights, prices=prices, instrument=instrument), samples)
def bootstrap(instrument, parallel_process=True):
    print("Parallel Process: ", parallel_process)
    forecasts = instrument.forecasts().dropna()
    prices = instrument.prices().reset_index('Contract', drop=True)
    prices = prices[forecasts.index]
    years = list(set(prices.index.year))
    years.sort()
    result = {}
    for year in years:
        sample_length = np.int(prices[:str(year)].size / 10)
        end_of_sample_selection_space = prices[:str(year)].tail(1).index[0] - pd.Timedelta(days=sample_length)
        sample_dates = pd.to_datetime(np.random.choice(prices[:end_of_sample_selection_space].index, 100))
        if sample_length > 50:
            samples = [forecasts.loc[date:date + pd.Timedelta(days=sample_length)] for date in sample_dates]
            if parallel_process is True:
                weights = pd.DataFrame(list(mp_optimize_weights(samples, prices[:str(year)], instrument=instrument)))
            else:
                weights = pd.DataFrame(list(map(partial(optimize_weights, prices=prices[:str(year)], instrument=instrument), samples)))
            if len(weights) < 2:
                print('Weights error')
                break
            result[year] = weights.mean()
            print(year, sample_length)
    output = pd.DataFrame.from_dict(result).transpose()
    output.columns = forecasts.columns
    pl.plot(output)
    display.clear_output(wait=True)
    display.display(pl.gcf())
    return output
import numpy as np
import pandas as pd
class accountCurve():
    def __init__(self, forecasts, prices, annual_volatility_target=0.25, multiplier=1,
                 per_contract_cost=0, slippage=0, capital=100000, costs=True):
        if prices.index.names[0] == 'Contract':
            prices = prices.reset_index('Contract', drop=True)
        # Adjust for contract multiplier/pricing in pennies
        prices = prices * multiplier / 100
        self.prices = prices
        daily_volatility_target = annual_volatility_target / np.sqrt(252)
        instrument_value_volatility = prices.diff().ewm(span=36, min_periods=36).std()
        self.notional_position = (forecasts * daily_volatility_target * capital).divide(
            10.0 * instrument_value_volatility[forecasts.index], axis=0)
        self.notional_position.dropna(inplace=True)
        # Chunk trades to be at least 10% move in position (to reduce trading costs)
        self.position = chunk_trades(self.notional_position)
        # Round positions to integers
        self.position = np.around(self.notional_position)
        # self.position.mark_to_market = self.position.multiply(prices, axis=0)
        self.trades = self.position.diff()
        # Calculate returns
        self.gross_returns = (prices.diff().shift(-1) * self.position).dropna()
        if costs:
            self.costs = self.trades.abs() * per_contract_cost
            self.slippage = self.trades.abs() * slippage * multiplier / 100
            self.returns = (self.gross_returns - self.slippage - self.costs).dropna()
        else:
            self.returns = self.gross_returns

    def sharpe(self):
        return self.returns.mean() / self.returns.std() * np.sqrt(252)

    def losses(self):
        return [z for z in self.returns if z < 0]

    def sortino(self):
        returns = self.returns.pct_change()
        return returns.mean() / np.std(losses(returns)) * np.sqrt(252)

    def plot(self):
        self.returns.cumsum().plot()


def chunk_trades(A):
    # Take a list of notional positions and filter so that trades are only greater
    # than 10% of notional position
    last = A[0]
    new = []
    for x in A.iteritems():
        if np.abs((x[1] - last) / last) > 0.1:
            new.append(x[1])
            last = x[1]
        else:
            new.append(last)
    s = pd.Series(new, index=A.index)
    return s
On my data, this takes around 45 minutes to run.
I'd like to know:
- Is my approach to parallel processing correct? Should I be using threads instead of processes?
- Can I reconfigure minimize so it finishes faster? This is bootstrapping, a Monte Carlo sampling method, so it may not need a very accurate result (see the sketch after this list).
- Is there anything else I can do to speed it up?
In an ideal world, I'd like to speed it up by an order of magnitude.
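On the accuracy point, here is a minimal sketch of what relaxing the optimizer might look like (the ftol/maxiter values and the chunksize below are illustrative assumptions, not tested settings): SLSQP accepts ftol and maxiter options, and ProcessPoolExecutor.map accepts a chunksize argument that reduces inter-process overhead when there are many small tasks.

# Sketch: looser SLSQP stopping criteria for a Monte Carlo bootstrap,
# where high precision per sample is usually unnecessary.
result = minimize(function, guess, (forecasts, prices, instrument),
                  bounds=bounds, method='SLSQP', constraints=cons,
                  options={'ftol': 1e-3,    # looser than the SLSQP default of 1e-6
                           'maxiter': 50,   # cap the number of iterations per call
                           'eps': 1e0,      # keep the coarse finite-difference step
                           'disp': False})

# Sketch: batch the samples so each worker process gets larger chunks of work.
with ProcessPoolExecutor() as executor:
    weights = executor.map(partial(optimize_weights, prices=prices, instrument=instrument),
                           samples, chunksize=10)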

Related

Fitting model to data using scipy differential evolution: "RuntimeError: The map-like callable must be of the form f(func, iterable)..."

I am trying to fit a model to data (extracted from an Excel file and imported using pandas) using a likelihood method. However, when running the code I get "RuntimeError: The map-like callable must be of the form f(func, iterable), returning a sequence of numbers the same length as 'iterable'", raised at the line result_simul_G = minimize(negLogLike, params, method = 'differential_evolution', args=(x, y),). My code is below; it is quite interconnected, so I couldn't find a way to illustrate what's happening without showing most of it.
#================================================================================
import numpy as np
import pandas as pd
import os
from lmfit import minimize, Parameters, Parameter, report_fit

params = Parameters()
params.add('gamma', value=.45, min=0, max=1, vary=True)
params.add('n', value=1, min=0, max=3, vary=True)
filename = 'data.xlsx'
#================================================================================
def negLogLike(params, xData, yData):
    new_xData = []
    new_yData = []
    for i in range(len(yData)):
        if (yData[i] != 0) and (xData[i] != 0):
            new_xData.append(xData[i])
            new_yData.append(yData[i])
    model_result = model(new_xData, params)
    nll = 0
    epsilon = 10**-10
    for i in range(len(new_yData)):
        if model_result[i] < epsilon:
            model_result[i] = epsilon
        if model_result[i] > 1 - epsilon:
            model_result[i] = 1 - epsilon
        nll += new_yData[i] * np.log(model_result[i]) + (1 - new_yData[i]) * np.log(1 - model_result[i])
    return -nll
#================================================================================
def model(x, params):
    try:  # Get parameters
        g = params['gamma'].value
        n = params['n'].value
    except KeyError:
        g, n = params
    y = 1 - np.exp(-g * x**n)
    return y
#================================================================================
def GetFits(DataFrame):
    cell_count = 2300000
    GFP_GC_SIMUL = np.ones(DataFrame.shape[0], float)
    GFP_IC_SIMUL = np.ones(DataFrame.shape[0], float)
    # Data
    for i in range(DataFrame.shape[0]):
        GFP_GC_SIMUL[i] = DataFrame.loc[i, 'GFP genomes'] / cell_count
        GFP_IC_SIMUL[i] = DataFrame.loc[i, 'GFP IU'] / cell_count
    x = np.array(GFP_GC_SIMUL[10:-10])
    y = np.array(GFP_IC_SIMUL[10:-10])
    print('len=', len(x), x.dtype, ', x=', x)
    print('------------------------')
    print('len=', len(y), y.dtype, ', y=', y)
    result_simul_G = minimize(negLogLike, params, method='differential_evolution', args=(x, y),)
#================================================================================
DataFrame = pd.read_excel('data.xlsx', engine='openpyxl')
GetFits(DataFrame)
When debugging on my own I used print statements to see what x and y data was being supplied to the minimizer and this is what it showed:
len= 34 float64 , x= [0.14478261 0.28695652 0.28695652 0.28695652 0.57391304 0.57391304
0.57391304 0.8738913 0.8738913 0.8738913 1.16086957 1.16086957
1.16086957 1.44780435 1.44780435 1.44780435 1.73478261 1.73478261
1.73478261 2.03476087 2.03476087 2.03476087 2.32173913 2.32173913
2.32173913 2.60869565 2.60869565 2.60869565 2.86956522 2.86956522
2.86956522 7.17391304 7.17391304 7.17391304]
------------------------
len= 34 float64 , y= [0.005 0.01180435 0.01226087 0.01158696 0.036 0.03704348
0.03467391 0.07030435 0.06556522 0.07567391 0.1001087 0.09852174
0.0986087 0.13626087 0.13978261 0.13956522 0.16847826 0.16408696
0.19391304 0.1945 0.21319565 0.19052174 0.32204348 0.23330435
0.25028261 0.28136957 0.26293478 0.25893478 0.28273913 0.29717391
0.273 0.60826087 0.60834783 0.59482609]
I know this is quite a lot but I would appreciate any and all help.
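One observation that may help narrow this down (a suggestion, not a confirmed diagnosis): SciPy's differential_evolution raises exactly this RuntimeError whenever the objective raises a TypeError or ValueError while the initial population is evaluated, so the message can mask the real error. In negLogLike above, model() is handed new_xData as a plain Python list, and x**n is not defined for a list, so converting to NumPy arrays before calling model() is one thing worth trying:

# Hypothetical tweak inside negLogLike: pass arrays, not lists, to model(),
# so that -g * x**n is computed element-wise instead of raising a TypeError.
new_xData = np.asarray(new_xData, dtype=float)
new_yData = np.asarray(new_yData, dtype=float)
model_result = model(new_xData, params)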

Wrong fit when using k nearest neighbors regression

I use the nearest neighbors method to predict the price of a stock. I have raw data in an example.txt file and use the close column (the price at the end of each one-minute period). Linear regression predicts well (shown in green), but the nearest neighbors method works only at the beginning and then turns into a straight line. Please tell me how to fix this. Here is the code I wrote:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor


class Reader:
    def __init__(self, filename='example.txt'):
        self.filename = filename

    def read(self):
        try:
            file = open(self.filename)
            return file.read()
        except IOError:
            return "File not found"


def main():
    x = Reader('example.txt')
    print(x.read())


class Regression:
    def __init__(self, window, P0, Ptest, i):
        self.window = window
        self.P0 = P0
        self.Ptest = Ptest
        self.i = i
        self.data_train = self.get_data_train()
        self.x_train = self.get_x_train()
        self.y_train = self.get_y_train()
        self.data_test = self.get_data_test()
        self.x_test = self.get_x_test()
        self.y_test = self.get_y_test()

    def get_data_train(self):
        """ Method of obtaining data train on prices for the entire period."""
        x = Reader('example.txt')
        data = x.read().splitlines()
        close_column = [x.split(',')[7] for x in data][1:]
        result = [float(item) for item in close_column]
        relative_price = result[:int(len(result) * P0)]
        return relative_price

    def get_data_test(self):
        """ Method of obtaining data test on prices for the entire period."""
        x = Reader('example.txt')
        data = x.read().splitlines()
        close_column = [x.split(',')[7] for x in data][1:]
        result = [float(item) for item in close_column]
        len_x_test = int(len(result) * Ptest)
        len_x_train = int(len(result) * P0)
        relative_price = result[(len_x_train + (len_x_test * self.i)): len_x_train + len_x_test * (self.i + 1)]
        return relative_price

    def get_x_train(self):
        x = []
        for i in range(len(self.data_train)):
            if i + self.window < len(self.data_train):
                x.append(self.data_train[i: i + self.window])
        return x

    def get_y_train(self):
        y = []
        for i in self.data_train[self.window:]:
            y += [i]
        return y

    def get_x_test(self):
        x = []
        for i in range(len(self.data_test)):
            if i + self.window < len(self.data_test):
                x.append(self.data_test[i: i + self.window])
        return x

    def get_y_test(self):
        y = []
        for i in self.data_test[self.window:]:
            y += [i]
        return y


class Linear_regression(Regression):
    def callculate(self):
        reg_linear = LinearRegression().fit(self.x_train, self.y_train)
        y_pred = reg_linear.predict(self.x_test)
        return y_pred


class Nearest_neighbor(Regression):
    def callculate(self):
        reg_neighbor = KNeighborsRegressor(n_neighbors=window, weights='distance')
        reg_neighbor.fit(self.x_train, self.y_train)
        y_pred = reg_neighbor.predict(self.x_test)
        return y_pred


window = 10
Pk = 1
P0 = 0.1
Ptest = 0.01
k = (Pk - P0) / Ptest
i = 0
y_real = []
y_neigh = []
y_lin = []
while i < k:
    lin_price = list(Linear_regression(window, P0, Ptest, i).callculate())
    neighbor = list(Nearest_neighbor(window, P0, Ptest, i).callculate())
    y_neigh.extend(neighbor)
    y_lin.extend(lin_price)
    y_real.extend(list(Linear_regression(window, P0, Ptest, i).y_test))
    i += 1

""" Output to graphs of the received data """
fig, ax = plt.subplots()
ax.plot(y_real, label='Initial data')
ax.plot(y_neigh, label='Nearest Neighbor Data')
ax.plot(y_lin, label='Linear Regression Data')
ax.set_xlabel('Time (min)')
ax.set_ylabel('Price, ($)')
ax.legend()
plt.show()
"Linear regression predicts well"
No, it never predicted well. You just looked at the graph and thought it looked similar. If you look more closely, your 'model' simply takes the price from a little while ago as its prediction of the current price. That means it isn't predicting anything: it's a history device, not a prediction device.
That's also why, if you feed this sort of 'model' back into itself, you get a straight line: it always predicts that the next price will be equal to the last one.
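To see this concretely, here is a minimal sketch (with a synthetic random-walk series standing in for the real prices; the data and the one-step lag are illustrative assumptions) showing that a naive "predict the previous price" baseline produces the same apparently good-looking curve:

import numpy as np
import matplotlib.pyplot as plt

# Synthetic random-walk "price" series standing in for the real data.
rng = np.random.default_rng(0)
price = 100 + np.cumsum(rng.normal(0, 0.1, 500))

# Naive baseline: predict that the next price equals the current one.
naive_pred = price[:-1]

fig, ax = plt.subplots()
ax.plot(price[1:], label='actual price')
ax.plot(naive_pred, label='naive lag-1 "prediction"')
ax.legend()
plt.show()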

Pytorch PPO implementation is not learning

This PPO implementation has a bug somewhere and I can't figure out what's wrong. The network returns a normal distribution and a value estimate from the critic. The last layer of the actor provides four F.tanh'ed action values, which are used as the mean of the distribution, and nn.Parameter(torch.zeros(action_dim)) is the standard deviation.
The trajectories of 20 parallel agents are added to the same memory. The episode length is 1000, and memory.sample() returns a np.random.permutation of the 20k memory entries as tensors in batches of size 64. Before the batch tensors are stacked, the values are stored as (1, -1) tensors in collections.deque objects. The returned tensors are detach()ed.
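For context, here is a minimal sketch of the kind of actor-critic module described above (the state size, hidden width, and the use of a learned log standard deviation rather than a raw standard deviation are assumptions, not the original network):

import torch
import torch.nn as nn

class ActorCritic(nn.Module):
    # Sketch: returns a Normal distribution (tanh-squashed mean, learned
    # state-independent std, parameterised here as a log-std) and a value estimate.
    def __init__(self, state_dim=33, action_dim=4, hidden=128):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU())
        self.mu_head = nn.Linear(hidden, action_dim)
        self.value_head = nn.Linear(hidden, 1)
        self.log_std = nn.Parameter(torch.zeros(action_dim))  # free parameter for the std

    def forward(self, states):
        h = self.body(states)
        mu = torch.tanh(self.mu_head(h))
        dist = torch.distributions.Normal(mu, self.log_std.exp())
        value = self.value_head(h)
        return dist, value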
environment
brain_name = envs.brain_names[0]
env_info = envs.reset(train_mode=True)[brain_name]
env_info = envs.step(actions.cpu().detach().numpy())[brain_name]
next_states = env_info.vector_observations
rewards = env_info.rewards
dones = env_info.local_done
update step
def clipped_surrogate_update(policy, memory, num_epochs=10, clip_param=0.2,
                             gradient_clip=5, beta=0.001, value_loss_coeff=0.5):
    advantages_batch, states_batch, log_probs_old_batch, returns_batch, actions_batch = memory.sample()
    advantages_batch = (advantages_batch - advantages_batch.mean()) / advantages_batch.std()
    for _ in range(num_epochs):
        for i in range(len(advantages_batch)):
            advantages_sample = advantages_batch[i]
            states_sample = states_batch[i]
            log_probs_old_sample = log_probs_old_batch[i]
            returns_sample = returns_batch[i]
            actions_sample = actions_batch[i]

            dist, values = policy(states_sample)
            log_probs_new = dist.log_prob(actions_sample.to(device)).sum(-1).unsqueeze(-1)
            entropy = dist.entropy().sum(-1).unsqueeze(-1).mean()

            ratio = (log_probs_new - log_probs_old_sample).exp()
            clipped_ratio = torch.clamp(ratio, 1 - clip_param, 1 + clip_param)
            clipped_surrogate_loss = -torch.min(ratio * advantages_sample, clipped_ratio * advantages_sample).mean()
            value_function_loss = (returns_sample - values).pow(2).mean()

            Loss = clipped_surrogate_loss - beta * entropy + value_loss_coeff * value_function_loss

            optimizer_policy.zero_grad()
            Loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), gradient_clip)
            optimizer_policy.step()
            del Loss
data sampling
def collect_trajectories(envs, env_info, policy, memory, tmax=200, nrand=0, gae_tau=0.95, discount=0.995):
    next_episode = False
    states = env_info.vector_observations
    n_agents = len(env_info.agents)

    state_list = []
    reward_list = []
    prob_list = []
    action_list = []
    value_list = []

    if nrand > 0:
        # perform nrand random steps
        for _ in range(nrand):
            actions = np.random.randn(num_agents, action_size)
            actions = np.clip(actions, -1, 1)
            env_info = envs.step(actions)[brain_name]
        states = env_info.vector_observations

    for t in range(tmax):
        states = torch.FloatTensor(states).to(device)
        dist, values = policy(states)
        actions = dist.sample()
        probs = dist.log_prob(actions).sum(-1).unsqueeze(-1)

        env_info = envs.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        state_list.append(states)
        reward_list.append(rewards)
        prob_list.append(probs)
        action_list.append(actions)
        value_list.append(values)

        states = next_states

        if np.any(dones):
            next_episode = True
            break

    _, next_value = policy(torch.FloatTensor(states).to(device))

    reward_arr = np.array(reward_list)
    undiscounted_rewards = np.sum(reward_arr, axis=0)

    state_arr = torch.stack(state_list)
    prob_arr = torch.stack(prob_list)
    action_arr = torch.stack(action_list)
    value_arr = torch.stack(value_list)
    reward_arr = torch.FloatTensor(reward_arr[:, :, np.newaxis])

    advantage_list = []
    return_list = []

    returns = next_value.detach()
    advantages = torch.FloatTensor(np.zeros((n_agents, 1)))

    for i in reversed(range(state_arr.shape[0])):
        returns = reward_arr[i] + discount * returns
        td_error = reward_arr[i] + discount * next_value - value_arr[i]
        advantages = advantages * gae_tau * discount + td_error
        next_value = value_arr[i]

        advantage_list.append(advantages.detach())
        return_list.append(returns.detach())

    advantage_arr = torch.stack(advantage_list)
    return_arr = torch.stack(return_list)

    for i in range(state_arr.shape[0]):
        memory.add({'advantages': advantage_arr[i],
                    'states': state_arr[i],
                    'log_probs_old': prob_arr[i],
                    'returns': return_arr[i],
                    'actions': action_arr[i]})

    return undiscounted_rewards, next_episode
In the Generalized Advantage Estimation loop, the advantages and returns are added in reversed order:
    advantage_list.insert(0, advantages.detach())
    return_list.insert(0, returns.detach())

Earth&Moon orbit system. My data is wrong

Here is my code. I fixed it like this:
# Take 3 digits for significant figures in this code
import numpy as np
from math import *
from astropy.constants import *
import matplotlib.pyplot as plt
import time

start_time = time.time()

"""
G = Gravitational constant
g0 = Standard acceleration of gravity (9.8 m/s2)
M_sun = Solar mass
M_earth = Earth mass
R_sun = Solar radius
R_earth = Earth equatorial radius
au = Astronomical unit
astropy.constants doesn't have any parameters of the Moon,
so I take the data from Wikipedia (https://en.wikipedia.org/wiki/Moon).
"""
M_moon = 7.342E22
R_moon = 1.737E6
M_earth = M_earth.value
R_earth = R_earth.value
G = G.value

perigee, apogee = 3.626E8, 4.054E8
position_E = np.array([0, 0])
position_M = np.array([(perigee + apogee) / 2., 0])
position_com = (M_earth * position_E + M_moon * position_M) / (M_earth + M_moon)
rel_pE = position_E - position_com
rel_pM = position_M - position_com

F = G * M_moon * M_earth / (position_M[0]**2)
p_E = {"x": rel_pE[0], "y": rel_pE[1], "v_x": 0, "v_y": (float(F * rel_pE[0]) / M_earth)**.5}
p_M = {"x": rel_pM[0], "y": rel_pM[1], "v_x": 0, "v_y": (float(F * rel_pM[0]) / M_moon)**.5}
print(p_E, p_M)

t = range(0, 365)
data_E, data_M = [], []

def s(initial_velocity, acceleration, time):
    result = initial_velocity * time + 0.5 * acceleration * time**2
    return result

def v(initial_velocity, acceleration, time):
    result = initial_velocity + acceleration * time
    return result

dist = float(sqrt((p_E["x"] - p_M['x'])**2 + (p_E["y"] - p_M["y"])**2))

xE = []
yE = []
xM = []
yM = []
data_E, data_M = [None] * len(t), [None] * len(t)

for i in range(1, 366):
    data_E[i-1] = p_E
    data_M[i-1] = p_M
    dist = ((p_E["x"] - p_M["x"])**2 + (p_E["y"] - p_M["y"])**2)**0.5
    Fg = G * M_moon * M_earth / (dist**2)
    theta_E = np.arctan(p_E["y"] / p_E["x"])
    theta_M = theta_E + np.pi  # np.arctan(data_M[i-1]["y"]/data_M[i-1]["x"])
    Fx_E = Fg * np.cos(theta_E)
    Fy_E = Fg * np.sin(theta_E)
    Fx_M = Fg * np.cos(theta_M)
    Fy_M = Fg * np.sin(theta_M)
    a_E = Fg / M_earth
    a_M = Fg / M_moon
    v_E = (p_E["v_x"]**2 + p_E["v_y"]**2)**.5
    v_M = (p_M["v_x"]**2 + p_M["v_y"]**2)**.5
    p_E["v_x"] = v(p_E["v_x"], Fx_E / M_earth, 24 * 3600)
    p_E["v_y"] = v(p_E["v_y"], Fy_E / M_earth, 24 * 3600)
    p_E["x"] += s(p_E['v_x'], Fx_E / M_earth, 24 * 3600)
    p_E["y"] += s(p_E['v_y'], Fy_E / M_earth, 24 * 3600)
    p_M["v_x"] = v(p_M["v_x"], Fx_M / M_moon, 24 * 3600)
    p_M["v_y"] = v(p_M["v_y"], Fy_M / M_moon, 24 * 3600)
    p_M["x"] += s(p_M['v_x'], Fx_M / M_moon, 24 * 3600)
    p_M["y"] += s(p_M['v_y'], Fy_M / M_moon, 24 * 3600)

for i in range(0, len(t)):
    xE += data_E[i]["x"]
    yE += data_E[i]["y"]
    xM += data_M[i]["x"]
    yM += data_M[i]["y"]

print("\n Run time \n --- %d seconds ---" % (time.time() - start_time))
After running this code I tried to print data_E and data_M. I do get data, but there is no difference between the entries: they are all the same. Yet when I printed the values step by step inside the loop, they were completely different.
So I have two problems: the stored data is wrong, and the Earth-Moon distance keeps increasing. Please help me with this.
The code exits near line 45, where you try to assign p_E by taking the square root of a negative number on the right-hand side (you moved the x coordinate of the Earth to a negative value when shifting the Earth and Moon into the coordinate system of their center of mass). At that point the value of F*rel_pE[0]/M_earth is negative, so the code never reaches the end of the program under Python 2.7.14. That bug needs to be solved before any further aspects can be discussed.
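For what it's worth, here is a minimal sketch of one way to make that initialization well-defined (an illustration of the idea, not necessarily the intended physics): use the magnitude of each body's distance from the center of mass so the square root is real, and give the two bodies opposite tangential directions so they circle the barycenter in the same sense.

# Sketch: distances from the center of mass as magnitudes (always positive).
r_E = abs(rel_pE[0])   # Earth's distance from the center of mass
r_M = abs(rel_pM[0])   # Moon's distance from the center of mass
# Opposite signs for v_y so Earth and Moon orbit the barycenter in the same direction.
p_E = {"x": rel_pE[0], "y": rel_pE[1], "v_x": 0, "v_y": -(F * r_E / M_earth) ** 0.5}
p_M = {"x": rel_pM[0], "y": rel_pM[1], "v_x": 0, "v_y": (F * r_M / M_moon) ** 0.5}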

Porting pymc2 code to pymc3: custom likelihood function

I am trying to implement the censored-data example in Lee & Wagenmakers' book (chapter 5.5, page 70). In pymc2, I have the following model:
import numpy as np
import pymc

nattempts = 950
nfails = 949
n = 50  # Number of questions
y = np.zeros(nattempts)
y[nattempts - 1] = 1
z = 30
unobsmin = 15
unobsmax = 25
unobsrange = np.arange(unobsmin, unobsmax + 1)

theta = pymc.Uniform("theta", lower=.25, upper=1)

@pymc.observed
def Ylike(value=z, theta=theta, n=n, censorn=nfails, unobs=unobsrange):
    ylikeobs = pymc.binomial_like(x=value, n=n, p=theta)
    ylikeunobs = np.array([])
    for i in unobs:
        ylikeunobs = np.append(pymc.binomial_like(x=i, n=n, p=theta), ylikeunobs)
    return ylikeobs + sum(ylikeunobs) * censorn

testmodel = pymc.Model([theta, Ylike])
mcmc = pymc.MCMC(testmodel)
mcmc.sample(iter=20000, burn=50, thin=2)
which uses the @pymc.observed decorator.
I think I need to express the likelihood using pm.DensityDist; however, I could not figure out how to do it.
OK, I found out how to do it:
import pymc3 as pm

with pm.Model():
    theta = pm.Uniform("theta", lower=.25, upper=1)

    def logp(value, n, p):
        return pm.dist_math.bound(
            pm.dist_math.binomln(n, value)
            + pm.dist_math.logpow(p, value)
            + pm.dist_math.logpow(1 - p, n - value),
            0 <= value, value <= n,
            0 <= p, p <= 1)

    def Censorlike(value=z, n=n, censorn=nfails, unobs=unobsrange):
        ylikeobs = logp(value=value, n=n, p=theta)
        ylikeunobs = 0
        for i in unobs:
            ylikeunobs += logp(value=i, n=n, p=theta)
        return ylikeobs + ylikeunobs * censorn

    ylike = pm.DensityDist('ylike', Censorlike,
                           observed={'value': z, 'n': n, 'censorn': nfails, 'unobs': unobsrange})

    trace = pm.sample(3e3)
