How to implement a simple Monte Carlo function in pymc - python

I'm trying to get my head around how to implement a Monte Carlo function in Python using pymc, to replicate a spreadsheet from Douglas Hubbard's book How to Measure Anything.
My attempt was:
import numpy as np
import pandas as pd
from pymc import DiscreteUniform, Exponential, deterministic, Poisson, Uniform, Normal, Stochastic, MCMC, Model
maintenance_saving_range = DiscreteUniform('maintenance_saving_range', lower=10, upper=21)
labour_saving_range = DiscreteUniform('labour_saving_range', lower=-2, upper=9)
raw_material_range = DiscreteUniform('maintenance_saving_range', lower=3, upper=10)
production_level_range = DiscreteUniform('maintenance_saving_range', lower=15000, upper=35000)
@deterministic(plot=False)
def rate(m=maintenance_saving_range, l=labour_saving_range, r=raw_material_range, p=production_level_range):
    return (m + l + r) * p
model = Model([rate, maintenance_saving_range, labour_saving_range, raw_material_range, production_level_range])
mc = MCMC(model)
Unfortunately, I'm getting an error: ValueError: A tallyable PyMC object called maintenance_saving_range already exists. This will cause problems for some database backends.
What have I got wrong?

Ah, it was a copy-and-paste error: I'd given three distributions the same name.
Here's the code that works.
import numpy as np
import pandas as pd
from pymc import DiscreteUniform, Exponential, deterministic, Poisson, Uniform, Normal, Stochastic, MCMC, Model
%matplotlib inline
import matplotlib.pyplot as plt
maintenance_saving_range = DiscreteUniform('maintenance_saving_range', lower=10, upper=21)
labour_saving_range = DiscreteUniform('labour_saving_range', lower=-2, upper=9)
raw_material_range = DiscreteUniform('raw_material_range', lower=3, upper=10)
production_level_range = DiscreteUniform('production_level_range', lower=15000, upper=35000)
@deterministic(plot=False, name="rate")
def rate(m=maintenance_saving_range, l=labour_saving_range, r=raw_material_range, p=production_level_range):
    out = (m + l + r) * p
    return out
model = Model([rate, maintenance_saving_range, labour_saving_range, raw_material_range, production_level_range])
mc = MCMC(model)
mc.sample(iter=10000)
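To pull the simulated values out of the sampler afterwards, something like this should work (pymc 2's trace API; a sketch I've added, not part of the original post):
rate_samples = mc.trace('rate')[:]  # one simulated (m + l + r) * p value per iteration
print(rate_samples.mean(), rate_samples.std())
plt.hist(rate_samples, bins=50)  # distribution of the Monte Carlo outcomes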

Related

Is there a better way to solve this MINLP in pyscipopt?

I'm trying to solve the following MINLP, basically attempting to maximize the likelihood of a certain portfolio reaching a "ceiling" performance. My first attempt at the code is below.
EDIT: the math says maximize; it should say minimize.
from pyscipopt import Model, quicksum
import numpy as np
import pandas as pd
from random import uniform, normalvariate
model = Model()
t = 20000
stocks_portfolio = {}
# columns should be a list, not a set (a set has no guaranteed order)
stocks_df = pd.DataFrame(np.zeros((150, 4)), columns=['ids', 'Mean', 'cost', 'stdev'])
noptions = len(stocks_df)
stocks_df['ids'] = [i for i in range(noptions)]
stocks_df['Mean'] = [uniform(500, 2500) for i in range(noptions)]
stocks_df['cost'] = [stocks_df.loc[i, 'Mean']*uniform(50, 250) for i in range(noptions)]
stocks_df['stdev'] = [stocks_df.loc[i, 'Mean']*uniform(0.2, 0.5) for i in range(noptions)]
cov_mat = np.array([[normalvariate(0, 0.3) for i in range(noptions)] for j in range(noptions)])
for i in range(noptions):
    stocks_portfolio[i] = model.addVar(vtype='B')
model.addCons(quicksum(stocks_portfolio[i] for i in range(noptions)) == 15)
model.addCons(quicksum(stocks_df.loc[i, 'cost']*stocks_portfolio[i] for i in range(noptions)) <= 600000)
stand_in = model.addVar(vtype='C')
model.addCons(stand_in >=
              (t - quicksum(stocks_df.loc[i, 'Mean']*stocks_portfolio[i] for i in range(noptions)))
              / ((quicksum(stocks_portfolio[i]*stocks_df.loc[i, 'stdev']**2 for i in range(noptions))
                  + quicksum(2*stocks_portfolio[i]*stocks_portfolio[j]*cov_mat[i, j]
                             for i in range(noptions) for j in range(noptions)))**0.5))
model.setObjective(stand_in, 'minimize')
model.optimize()
model.getCondition()
portfolios = []
for i in range(noptions):
    if model.getVal(stocks_portfolio[i]) > 0.9:
        portfolios.append(i)
The performance here has been slow and unwieldy, and I was wondering if I'm thinking about the question all wrong.
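One reformulation worth trying (a sketch I've added, not a verified speed-up): SCIP's spatial branching tends to struggle with divisions and fractional powers inside constraints, so you can introduce an auxiliary standard-deviation variable and clear the fraction. Note also that an entry-wise normal cov_mat is neither symmetric nor positive semidefinite, so the variance expression can go negative; the sketch assumes it stays nonnegative for feasible portfolios.
sqrt_var = model.addVar(vtype='C', lb=0.0)  # auxiliary portfolio standard deviation, assumed >= 0
var_expr = (quicksum(stocks_portfolio[i]*stocks_df.loc[i, 'stdev']**2 for i in range(noptions))
            + quicksum(2*stocks_portfolio[i]*stocks_portfolio[j]*cov_mat[i, j]
                       for i in range(noptions) for j in range(noptions)))
model.addCons(sqrt_var*sqrt_var == var_expr)  # sqrt_var**2 equals the portfolio variance
mean_expr = quicksum(stocks_df.loc[i, 'Mean']*stocks_portfolio[i] for i in range(noptions))
# stand_in >= (t - mean)/sqrt_var, with the fraction cleared into a bilinear constraint
model.addCons(stand_in*sqrt_var >= t - mean_expr)
model.setObjective(stand_in, 'minimize')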

Python statsmodels VARMAX results

Every time I run a VARMAX model I get different coefficients.
Is there any way I could replicate my previous results without imposing a seed?
Thank you.
I tried to replicate the VARMA(p,q) example posted on the statsmodels webpage (https://www.statsmodels.org/dev/examples/notebooks/generated/statespace_varmax.html). To check the replicability of the results, I just added a loop that re-estimates the model and a dataframe (parameters) for saving the results. So this is my code:
%matplotlib inline
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
dta = sm.datasets.webuse('lutkepohl2', 'https://www.stata-press.com/data/r12/')
dta.index = dta.qtr
endog = dta.loc['1960-04-01':'1978-10-01', ['dln_inv', 'dln_inc', 'dln_consump']]
exog = endog['dln_consump']
parameters=pd.DataFrame()
for p in range(10):
    print(p)
    mod = sm.tsa.VARMAX(endog[['dln_inv', 'dln_inc']], order=(1, 1))
    res = mod.fit(maxiter=1000, disp=False)
    print(res.summary())
    param = pd.DataFrame(res.params, columns=["estimation " + str(p)])
    parameters = pd.concat([parameters, param], axis=1)
print(parameters)
As you can see, the results change every time I re-estimate the model:
estimation 0 estimation 1 estimation 2 \
const.dln_inv 0.010974 0.010934 0.010934
const.dln_inc 0.016554 0.016536 0.016536
L1.dln_inv.dln_inv -0.010164 -0.010087 -0.010087
L1.dln_inc.dln_inv 0.360306 0.362187 0.362187
L1.dln_inv.dln_inc -0.032975 -0.033071 -0.033071
L1.dln_inc.dln_inc 0.230657 0.231421 0.231421
L1.e(dln_inv).dln_inv -0.249916 -0.250307 -0.250307
L1.e(dln_inc).dln_inv 0.125546 0.125581 0.125581
L1.e(dln_inv).dln_inc 0.088878 0.089001 0.089001
L1.e(dln_inc).dln_inc -0.235258 -0.235176 -0.235176
sqrt.var.dln_inv 0.044926 0.044927 0.044927
sqrt.cov.dln_inv.dln_inc 0.001670 0.001662 0.001662
sqrt.var.dln_inc 0.011554 0.011554 0.011554
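One thing worth checking (a hedged suggestion, not a confirmed fix): fit() starts the optimizer from data-dependent start values, and passing them explicitly pins the starting point, which removes one source of run-to-run variation:
mod = sm.tsa.VARMAX(endog[['dln_inv', 'dln_inc']], order=(1, 1))
# start_params is the model's default starting vector; passing it explicitly
# fixes the optimizer's starting point across runs
res = mod.fit(start_params=mod.start_params, maxiter=1000, disp=False)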

How to specify size for bernoulli distribution with pymc3?

While making my way through Bayesian Methods for Hackers, which is written in pymc, I came across this code:
first_coin_flips = pm.Bernoulli("first_flips", 0.5, size=N)
I've tried to translate this to pymc3 with the following, but it just returns a numpy array, rather than a tensor (?):
first_coin_flips = pm.Bernoulli("first_flips", 0.5).random(size=50)
The reason the size matters is that it's used later on in a deterministic variable. Here's the entirety of the code that I have so far:
import pymc3 as pm
import matplotlib.pyplot as plt
import numpy as np
import mpld3
import theano.tensor as tt

model = pm.Model()
with model:
    N = 100
    p = pm.Uniform("cheating_freq", 0, 1)
    true_answers = pm.Bernoulli("truths", p)
    print(true_answers)
    first_coin_flips = pm.Bernoulli("first_flips", 0.5)
    second_coin_flips = pm.Bernoulli("second_flips", 0.5)
    # print(first_coin_flips.value)

    # Create model variables
    def calc_p(true_answers, first_coin_flips, second_coin_flips):
        observed = first_coin_flips * true_answers + (1 - first_coin_flips) * second_coin_flips
        # NOTE: Where I think the size param matters, since we're dividing by it
        return observed.sum() / float(N)

    calced_p = pm.Deterministic("observed", calc_p(true_answers, first_coin_flips, second_coin_flips))
    step = pm.Metropolis(model.free_RVs)
    trace = pm.sample(1000, tune=500, step=step)

pm.traceplot(trace)
html = mpld3.fig_to_html(plt.gcf())
with open("output.html", 'w') as f:
    f.write(html)
And the output: the coin-flip and cheating_freq traces look correct, but the observed one doesn't look like anything to me; I think it's because I'm not translating that size param correctly.
The pymc3 way to specify the size of a Bernoulli distribution is by using the shape parameter, like:
first_coin_flips = pm.Bernoulli("first_flips", 0.5, shape=N)
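For reference, here is a sketch of how the whole model might look with shape=N applied throughout (my adaptation, not verified against the book), so the elementwise arithmetic and the division by N line up:
import pymc3 as pm

N = 100
with pm.Model() as model:
    p = pm.Uniform("cheating_freq", 0, 1)
    true_answers = pm.Bernoulli("truths", p, shape=N)
    first_coin_flips = pm.Bernoulli("first_flips", 0.5, shape=N)
    second_coin_flips = pm.Bernoulli("second_flips", 0.5, shape=N)
    # elementwise: answer honestly on heads, otherwise report the second flip
    observed = first_coin_flips*true_answers + (1 - first_coin_flips)*second_coin_flips
    calced_p = pm.Deterministic("observed", observed.sum() / N)
    trace = pm.sample(1000, tune=500, step=pm.Metropolis())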

scipy can't optimize parameters, gives error: RuntimeWarning: invalid value encountered in reduce

I'm trying to optimize my model's parameters against data with scipy.optimize, but scipy fails to minimize the function and find the parameter values; it just returns the initial guess that the user gives as input. It also emits the following warning: RuntimeWarning: invalid value encountered in reduce.
import pandas as pd
import numpy as np
from math import log10
import math
import scipy.optimize as op
from scipy.integrate import odeint
df1 = pd.read_csv('dataset1.csv')
z = df1.loc[:, "z"]
za = z.as_matrix(columns=None)
mu = df1.loc[:, "mu"]
mua = mu.as_matrix(columns=None)
si = df1.loc[:, "sig"]
sia = si.as_matrix(columns=None)
c = 299792.458
H0 = 70
m_t = 0.3
d_t = 0.7
mu0 = 25 + 5*log10(c/H0)

def model(x, t, m, d):
    dydt = 1/(math.sqrt((((1+x)**2)*(1+m*x)) - (x*d*(2+x))))
    return dydt

def Io(zb, m, d):
    return odeint(model, 0, zb, args=(m, d))

def lnlike(theta, zb, mub, sib):
    m, d = theta
    isia2 = 1.0/np.square(sib)
    return 0.5*(np.sum(((((5*(np.log10((1+zb)*Io(zb, m, d))) + mu0) - mub)**2)*isia2) - np.log(isia2)))

nll = lambda *args: -lnlike(*args)
result = op.minimize(nll, [m_t, d_t], args=(za, mua, sia))
m_ml, d_ml = result["x"]
print(m_ml, d_ml)
I think scipy is not able to handle the illegal values generated by the square root. If so, how can one bypass the illegal values?
The dataset1 file can be found at: https://drive.google.com/file/d/1HDzQ7rz_u9y63ECNkhtB49T2KBvu0qu6/view?usp=sharing
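A common workaround (a sketch, assuming the NaNs come from negative square-root and log arguments): have the objective return np.inf for invalid parameter values so the optimizer treats them as infinitely bad, and use a method such as Nelder-Mead, which copes better with such cliffs than the default gradient-based one:
def nll(theta, zb, mub, sib):
    try:
        val = lnlike(theta, zb, mub, sib)
    except ValueError:  # math.sqrt of a negative argument inside the ODE
        return np.inf
    # invalid (m, d) regions produce NaN; map them to +inf so the optimizer
    # backs away instead of stalling at the initial guess
    return np.inf if not np.isfinite(val) else -val

result = op.minimize(nll, [m_t, d_t], args=(za, mua, sia), method='Nelder-Mead')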

Getting standard errors from regressions using rpy2

I am using rpy2 for regressions. The returned object is a list that includes coefficients, residuals, fitted values, the rank of the fitted model, etc.
However, I can't find the standard errors (nor the R^2) in the fit object. Running lm directly in R, the standard errors are displayed with the summary command, but I can't access them directly in the model's data frame.
How can I extract this info using rpy2?
Sample Python code:
from scipy import random
from numpy import hstack, array, matrix
from rpy2 import robjects
from rpy2.robjects.packages import importr
def test_regress():
    stats = importr('stats')
    x = random.uniform(0, 1, 100).reshape([100, 1])
    y = 1 + x + random.uniform(0, 1, 100).reshape([100, 1])
    x_in_r = create_r_matrix(x, x.shape[1])
    y_in_r = create_r_matrix(y, y.shape[1])
    formula = robjects.Formula('y~x')
    env = formula.environment
    env['x'] = x_in_r
    env['y'] = y_in_r
    fit = stats.lm(formula)
    coeffs = array(fit[0])
    resids = array(fit[1])
    fitted_vals = array(fit[4])
    return (coeffs, resids, fitted_vals)

def create_r_matrix(py_array, ncols):
    if type(py_array) == type(matrix([1])) or type(py_array) == type(array([1])):
        py_array = py_array.tolist()
    r_vector = robjects.FloatVector(flatten_list(py_array))
    r_matrix = robjects.r['matrix'](r_vector, ncol=ncols)
    return r_matrix

def flatten_list(source):
    return [item for sublist in source for item in sublist]

test_regress()
So this seems to work for me:
def test_regress():
    stats = importr('stats')
    base = importr('base')  # needed for base.summary below
    x = random.uniform(0, 1, 100).reshape([100, 1])
    y = 1 + x + random.uniform(0, 1, 100).reshape([100, 1])
    x_in_r = create_r_matrix(x, x.shape[1])
    y_in_r = create_r_matrix(y, y.shape[1])
    formula = robjects.Formula('y~x')
    env = formula.environment
    env['x'] = x_in_r
    env['y'] = y_in_r
    fit = stats.lm(formula)
    coeffs = array(fit[0])
    resids = array(fit[1])
    fitted_vals = array(fit[4])
    modsum = base.summary(fit)
    rsquared = array(modsum[7])
    se = array(modsum.rx2('coefficients')[2:4])
    return (coeffs, resids, fitted_vals, rsquared, se)
Although, as I said, this is literally my first foray into RPy2, so there may be a better way to do that. But this version appears to output arrays containing the R-squared value along with the standard errors.
You can use print(modsum.names) to see the names of the components of the R object (kind of like names(modsum) in R) and then .rx and .rx2 are the equivalent of [ and [[ in R.
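For example ('r.squared' and 'coefficients' are standard components of an R summary.lm object):
print(modsum.names)                    # like names(modsum) in R
rsq = modsum.rx2('r.squared')          # equivalent of modsum[['r.squared']]
coefs = modsum.rx2('coefficients')     # matrix of estimates, std. errors, t and p values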
@joran: Pretty good. I'd say that it is pretty much the way to do it.
from rpy2 import robjects
from rpy2.robjects.packages import importr
base = importr('base')
stats = importr('stats') # import only once !
def test_regress():
    x = base.matrix(stats.runif(100), nrow=100)
    y = (x.ro + base.matrix(stats.runif(100), nrow=100)).ro + 1  # not so nice
    formula = robjects.Formula('y~x')
    env = formula.environment
    env['x'] = x
    env['y'] = y
    fit = stats.lm(formula)
    coefs = stats.coef(fit)
    resids = stats.residuals(fit)
    fitted_vals = stats.fitted(fit)
    modsum = base.summary(fit)
    rsquared = modsum.rx2('r.squared')
    se = modsum.rx2('coefficients')[2:4]
    return (coefs, resids, fitted_vals, rsquared, se)
