How to specify size for a Bernoulli distribution with pymc3?

While trying to make my way through Bayesian Methods for Hackers, which is written for pymc, I came across this code:
first_coin_flips = pm.Bernoulli("first_flips", 0.5, size=N)
I've tried to translate this to pymc3 with the following, but it just returns a numpy array, rather than a tensor (?):
first_coin_flips = pm.Bernoulli("first_flips", 0.5).random(size=50)
The reason the size matters is that it's used later on in a deterministic variable. Here's the entirety of the code that I have so far:
import pymc3 as pm
import matplotlib.pyplot as plt
import numpy as np
import mpld3
import theano.tensor as tt

model = pm.Model()
with model:
    N = 100
    p = pm.Uniform("cheating_freq", 0, 1)
    true_answers = pm.Bernoulli("truths", p)
    print(true_answers)
    first_coin_flips = pm.Bernoulli("first_flips", 0.5)
    second_coin_flips = pm.Bernoulli("second_flips", 0.5)
    # print(first_coin_flips.value)

    # Create model variables
    def calc_p(true_answers, first_coin_flips, second_coin_flips):
        observed = first_coin_flips * true_answers + (1 - first_coin_flips) * second_coin_flips
        # NOTE: Where I think the size param matters, since we're dividing by it
        return observed.sum() / float(N)

    calced_p = pm.Deterministic("observed", calc_p(true_answers, first_coin_flips, second_coin_flips))
    step = pm.Metropolis(model.free_RVs)
    trace = pm.sample(1000, tune=500, step=step)

pm.traceplot(trace)
html = mpld3.fig_to_html(plt.gcf())
with open("output.html", 'w') as f:
    f.write(html)
And the output:
The coin flips and uniform cheating_freq output look correct, but the observed doesn't look like anything to me, and I think it's because I'm not translating that size param correctly.

The pymc3 way to specify the size of a Bernoulli distribution is with the shape parameter:
first_coin_flips = pm.Bernoulli("first_flips", 0.5, shape=N)
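Applied to the model in the question, a minimal sketch (untested, keeping the question's variable names) would look like:

with pm.Model() as model:
    N = 100
    p = pm.Uniform("cheating_freq", 0, 1)
    true_answers = pm.Bernoulli("truths", p, shape=N)
    first_coin_flips = pm.Bernoulli("first_flips", 0.5, shape=N)
    second_coin_flips = pm.Bernoulli("second_flips", 0.5, shape=N)
    # Elementwise over the N students: report the truth after heads
    # on the first flip, otherwise report the second flip
    obs = first_coin_flips * true_answers + (1 - first_coin_flips) * second_coin_flips
    calced_p = pm.Deterministic("observed", obs.sum() / N)

Because every random variable now has shape=N, obs is a length-N tensor and the division by N happens symbolically, which is what the size argument accomplished in the original pymc code.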

Related

Display issue of fitted curve: cannot solve coarseness

Despite having a working script for curve fitting using the lmfit library, I am not able to solve a display issue. Indeed, having only 5 dependent values, the resulting graph is rather coarse.
Before switching to lmfit, I was using curve_fit and could solve the display issue simply by using np.linspace and plotting the optimized values resulting from the fit procedure, while displaying the "real" values through plt.errorbar. With lmfit, that approach raises a shape-mismatch error, since it detects that the "fake" independent variables do not match the data.
My full script is the following:
import lmfit as lf
from lmfit import Model, Parameters
import numpy as np
import matplotlib.pyplot as plt
from math import atan
def on_res(omega_eff, thetas, R2avg=5, k_ex=0.1, phi_ex=500):
    return R2avg*(np.sin(thetas))**2 + ((np.sin(thetas))**2)*(phi_ex*k_ex/(k_ex**2 + omega_eff**2))
model = Model(on_res,independent_vars=['omega_eff','thetas'])
params = model.make_params(R2avg=5, k_ex=0.01, phi_ex=1500)
carrier = 6146.53
O_1 = 5846
spin_locks = (1000, 2000, 3000, 4000, 5000)
delta_omega = (O_1 - carrier)
omega_eff1 = ((delta_omega**2) + (spin_locks[0]**2))**0.5
omega_eff2 = ((delta_omega**2) + (spin_locks[1]**2))**0.5
omega_eff3 = ((delta_omega**2) + (spin_locks[2]**2))**0.5
omega_eff4 = ((delta_omega**2) + (spin_locks[3]**2))**0.5
omega_eff5 = ((delta_omega**2) + (spin_locks[4]**2))**0.5
theta_rad1 = atan(spin_locks[0]/delta_omega)
theta_rad2 = atan(spin_locks[1]/delta_omega)
theta_rad3 = atan(spin_locks[2]/delta_omega)
theta_rad4 = atan(spin_locks[3]/delta_omega)
theta_rad5 = atan(spin_locks[4]/delta_omega)
x = (omega_eff1/1000, omega_eff2/1000, omega_eff3/1000, omega_eff4/1000, omega_eff5/1000)# , omega_eff6/1000)# , omega_eff7/1000)
theta = (theta_rad1, theta_rad2, theta_rad3, theta_rad4, theta_rad5)
R1rho_vals = (7.9328, 6.2642, 6.0005, 5.9972, 5.988)
e = (0.2, 0.2, 0.2, 0.2, 0.2)
new_x = np.linspace(0, 6, 1000)
omega_eff = np.array(x, dtype=float)
thetas = np.array(theta, dtype=float)
R1rho_vals = np.array(R1rho_vals, dtype=float)
error = np.array(e, dtype=float)
R2avg = []
k_ex = []
phi_ex = []
result = model.fit(R1rho_vals, params, weights=1/error, thetas=thetas, omega_eff=omega_eff, method = "emcee", steps = 1000)
print(result.fit_report())
plt.errorbar(x, R1rho_vals, yerr = error, fmt = ".k", markersize = 8, capsize = 3)
plt.plot(new_x, result.best_fit)
plt.show()
As you can see by running it, it raises the shape-mismatch error. Changing the plt.plot line to plt.plot(x, result.best_fit) yields the graph correctly, but displays a very coarse profile (as one would expect with only 5 points on the x-axis).
Are you aware of any way to solve this? Checking the documentation, I noticed that the provided examples all plot the results via the actual independent-variable values, since they have enough experimental points.
You need to re-evaluate the ModelResult with your new values for the independent variables:
plt.plot(new_x, result.eval(omega_eff=new_x/1000., thetas=thetas))
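The key point is that an lmfit ModelResult can be re-evaluated on any grid of independent-variable values via its eval method. A self-contained sketch of the coarse-data/smooth-curve pattern, using a made-up single-variable decay model and hypothetical data (not the question's):

import numpy as np
import matplotlib.pyplot as plt
from lmfit import Model

def decay(t, amp, tau):
    return amp * np.exp(-t / tau)

model = Model(decay)
t = np.array([0.0, 1.0, 2.0, 3.0, 4.0])    # only 5 coarse data points
y = np.array([5.1, 3.0, 1.9, 1.1, 0.7])
result = model.fit(y, t=t, amp=5, tau=1.5)

t_dense = np.linspace(0, 4, 200)            # dense grid for a smooth curve
plt.plot(t, y, 'ok')                        # the coarse measured points
plt.plot(t_dense, result.eval(t=t_dense))   # fitted curve, re-evaluated densely
plt.show()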

Implementing STFT with Pytorch gives a slightly different result than the STFT with Librosa

I am trying to implement STFT with Pytorch. But the output from the Pytorch implementation is slightly off, when compared with the implementation from Librosa.
Librosa version
import numpy as np
from librosa.core import stft
import matplotlib.pyplot as plt
np.random.seed(3)
y = np.sin(2*np.pi*50*np.linspace(0,10,2048))+np.sin(2*np.pi*20*np.linspace(0,10,2048)) + np.random.normal(scale=1,size=2048)
S_stft = np.abs(stft(y, hop_length=512, n_fft=2048,center=False))
plt.plot(S_stft)
Pytorch version
import torch
from torch.autograd import Variable
from torch.nn.functional import conv1d
from scipy.signal.windows import hann
stride = 512
def create_filters(d, k, low=50, high=6000):
    x = np.arange(0, d, 1)
    wsin = np.empty((k,1,d), dtype=np.float32)
    wcos = np.empty((k,1,d), dtype=np.float32)
    start_freq = low
    end_freq = high
    # num_cycles = start_freq*d/44000.
    # scaling_ind = np.log(end_freq/start_freq)/k
    window_mask = hann(2048, sym=False) # same as 0.5-0.5*np.cos(2*np.pi*x/(k))
    for ind in range(k):
        wsin[ind,0,:] = window_mask*np.sin(2*np.pi*ind/k*x)
        wcos[ind,0,:] = window_mask*np.cos(2*np.pi*ind/k*x)
    return wsin, wcos
wsin, wcos = create_filters(2048,2048)
wsin_var = Variable(torch.from_numpy(wsin), requires_grad=False)
wcos_var = Variable(torch.from_numpy(wcos),requires_grad=False)
network_input = torch.from_numpy(y).float()
network_input = network_input.reshape(1,-1)
zx = np.sqrt(conv1d(network_input[:,None,:], wsin_var, stride=stride).pow(2)+conv1d(network_input[:,None,:], wcos_var, stride=stride).pow(2))
pytorch_Xs = zx.cpu().numpy()
plt.plot(pytorch_Xs[0,:1025,0])
My Question
The two graphs might look the same, but checking the two outputs with np.allclose shows that they are slightly different.
np.allclose(S_stft, pytorch_Xs[0,:1025,0].reshape(1025,1))
output >>> False
Only when I loosen the tolerance to 1e-5 does it return True:
np.allclose(S_stft, pytorch_Xs[0,:1025,0].reshape(1025,1),atol=1e-5)
output >>> True
What causes the difference in values? Is it because of the data conversion by torch.from_numpy(y).float()?
I would like the difference to be less than 1e-7; 1e-8 would be even better.
The difference comes from the two libraries' default floating-point precision:
NumPy's float is 64bit by default.
PyTorch's float is 32bit by default.
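A sketch of a fix along those lines, reusing the variable names from the question's code and keeping the whole computation in 64-bit:

# Upcast the filters to float64 and keep the signal in float64
# (torch.from_numpy preserves NumPy's dtype, which is float64 here)
wsin, wcos = create_filters(2048, 2048)
wsin64 = torch.from_numpy(wsin.astype(np.float64))
wcos64 = torch.from_numpy(wcos.astype(np.float64))
signal = torch.from_numpy(y).reshape(1, 1, -1)

zx = torch.sqrt(conv1d(signal, wsin64, stride=stride).pow(2)
                + conv1d(signal, wcos64, stride=stride).pow(2))

Whether this reaches 1e-8 also depends on the reference side: older librosa versions return complex64 from stft by default, so the comparison may additionally need S_stft computed in higher precision.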

Use Python lmfit with a variable number of parameters in function

I am trying to deconvolve complex gas chromatogram signals into individual gaussian signals. Here is an example, where the dotted line represents the signal I am trying to deconvolve.
I was able to write the code to do this using scipy.optimize.curve_fit; however, once applied to real data the results were unreliable. I believe being able to set bounds to my parameters will improve my results, so I am attempting to use lmfit, which allows this. I am having a problem getting lmfit to work with a variable number of parameters. The signals I am working with may have an arbitrary number of underlying gaussian components, so the number of parameters I need will vary. I found some hints here, but still can't figure it out...
Creating a python lmfit Model with arbitrary number of parameters
Here is the code I am currently working with. The code will run, but the parameter estimates do not change when the model is fit. Does anyone know how I can get my model to work?
import numpy as np
from collections import OrderedDict
from scipy.stats import norm
from lmfit import Parameters, Model
def add_peaks(x_range, *pars):
    y = np.zeros(len(x_range))
    for i in np.arange(0, len(pars), 3):
        curve = norm.pdf(x_range, pars[i], pars[i+1]) * pars[i+2]
        y = y + curve
    return y
# generate some fake data
x_range = np.linspace(0, 100, 1000)
peaks = [50., 40., 60.]
a = norm.pdf(x_range, peaks[0], 5) * 2
b = norm.pdf(x_range, peaks[1], 1) * 0.1
c = norm.pdf(x_range, peaks[2], 1) * 0.1
fake = a + b + c
param_dict = OrderedDict()
for i in range(0, len(peaks)):
    param_dict['pk' + str(i)] = peaks[i]
    param_dict['wid' + str(i)] = 1.
    param_dict['mult' + str(i)] = 1.
# In case, you'd like to see the plot of fake data
#y = add_peaks(x_range, *param_dict.values())
#plt.plot(x_range, y)
#plt.show()
# Initialize the model and fit
pmodel = Model(add_peaks)
params = pmodel.make_params()
for i in param_dict.keys():
    params.add(i, value=param_dict[i])
result = pmodel.fit(fake, params=params, x_range=x_range)
print(result.fit_report())
I think you would be better off using lmfit's ability to build a composite model.
That is, with a single peak defined with
from scipy.stats import norm
def peak(x, amp, center, sigma):
    return amp * norm.pdf(x, center, sigma)
(see also lmfit.models.GaussianModel), you can build a model with many peaks:
npeaks = 3
model = Model(peak, prefix='p1_')
for i in range(1, npeaks):
    model = model + Model(peak, prefix='p%d_' % (i+1))
params = model.make_params()
Now model will be a sum of 3 Gaussian functions, and the params created for that model will have names like p1_amp, p1_center, p2_amp, ..., to which you can attach sensible initial values and/or bounds and/or constraints.
Given your example data, you could pass in initial values to make_params like
params = model.make_params(p1_amp=2.0, p1_center=50., p1_sigma=2,
                           p2_amp=0.2, p2_center=40., p2_sigma=2,
                           p3_amp=0.2, p3_center=60., p3_sigma=2)
result = model.fit(fake, params, x=x_range)
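Since the original motivation was setting bounds, note that each named parameter can be constrained after make_params; a sketch using the prefixed names from above:

params['p1_center'].set(min=45, max=55)  # keep the first peak near 50
params['p1_sigma'].set(min=0.01)         # widths must stay positive
params['p1_amp'].set(min=0)              # no negative amplitudes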
I was able to find a solution here:
https://lmfit.github.io/lmfit-py/builtin_models.html#example-3-fitting-multiple-peaks-and-using-prefixes
Building on the code above, the following accomplishes what I was trying to do...
from lmfit.models import GaussianModel
import matplotlib.pyplot as plt
gauss1 = GaussianModel(prefix='g1_')
gauss2 = GaussianModel(prefix='g2_')
gauss3 = GaussianModel(prefix='g3_')
gauss4 = GaussianModel(prefix='g4_')
gauss5 = GaussianModel(prefix='g5_')
gauss = [gauss1, gauss2, gauss3, gauss4, gauss5]
prefixes = ['g1_', 'g2_', 'g3_', 'g4_', 'g5_']
mod = np.sum(gauss[0:len(peaks)])
pars = mod.make_params()
for i, prefix in zip(range(0, len(peaks)), prefixes[0:len(peaks)]):
    pars[prefix + 'center'].set(peaks[i])
init = mod.eval(pars, x=x_range)
out = mod.fit(fake, pars, x=x_range)
print(out.fit_report(min_correl=0.5))
out.plot_fit()
plt.show()
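For reference, each built-in GaussianModel contributes parameters named <prefix>amplitude, <prefix>center, and <prefix>sigma, so bounds can be set the same way here; a sketch:

pars['g1_sigma'].set(value=2, min=0.1)
pars['g1_amplitude'].set(min=0)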

How to implement simple Monte Carlo function in pymc

I'm trying to get my head around how to implement a Monte Carlo function in Python using pymc, to replicate a spreadsheet by Douglas Hubbard in his book How to Measure Anything.
My attempt was:
import numpy as np
import pandas as pd
from pymc import DiscreteUniform, Exponential, deterministic, Poisson, Uniform, Normal, Stochastic, MCMC, Model
maintenance_saving_range = DiscreteUniform('maintenance_saving_range', lower=10, upper=21)
labour_saving_range = DiscreteUniform('labour_saving_range', lower=-2, upper=9)
raw_material_range = DiscreteUniform('maintenance_saving_range', lower=3, upper=10)
production_level_range = DiscreteUniform('maintenance_saving_range', lower=15000, upper=35000)
@deterministic(plot=False)
def rate(m=maintenance_saving_range, l=labour_saving_range, r=raw_material_range, p=production_level_range):
    return (m + l + r) * p
model = Model([rate, maintenance_saving_range, labour_saving_range, raw_material_range, production_level_range])
mc = MCMC(model)
Unfortunately, I'm getting an error: ValueError: A tallyable PyMC object called maintenance_saving_range already exists. This will cause problems for some database backends.
What have I got wrong?
Ah, it was a copy and paste error.
I'd called three distributions by the same name.
Here's the code that works.
import numpy as np
import pandas as pd
from pymc import DiscreteUniform, Exponential, deterministic, Poisson, Uniform, Normal, Stochastic, MCMC, Model
%matplotlib inline
import matplotlib.pyplot as plt
maintenance_saving_range = DiscreteUniform('maintenance_saving_range', lower=10, upper=21)
labour_saving_range = DiscreteUniform('labour_saving_range', lower=-2, upper=9)
raw_material_range = DiscreteUniform('raw_material_range', lower=3, upper=10)
production_level_range = DiscreteUniform('production_level_range', lower=15000, upper=35000)
@deterministic(plot=False, name="rate")
def rate(m=maintenance_saving_range, l=labour_saving_range, r=raw_material_range, p=production_level_range):
    # out = np.empty(10000)
    out = (m + l + r) * p
    return out
model = Model([rate, maintenance_saving_range, labour_saving_range, raw_material_range])
mc = MCMC(model)
mc.sample(iter=10000)
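To inspect the Monte Carlo result after sampling, the pymc (2.x) trace API can be used; a sketch:

rate_samples = mc.trace('rate')[:]   # posterior samples of the deterministic
print(rate_samples.mean(), rate_samples.std())
plt.hist(rate_samples, bins=50)
plt.show()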

Getting standard errors from regressions using rpy2

I am using rpy2 for regressions. The returned object has a list that includes coefficients, residuals, fitted values, the rank of the fitted model, etc.
However, I can't find the standard errors (nor the R^2) in the fit object. Running lm directly in R, the standard errors are displayed with the summary command, but I can't access them directly in the model's data frame.
How can I extract this info using rpy2?
Sample python code is
from scipy import random
from numpy import hstack, array, matrix
from rpy2 import robjects
from rpy2.robjects.packages import importr
def test_regress():
    stats = importr('stats')
    x = random.uniform(0, 1, 100).reshape([100, 1])
    y = 1 + x + random.uniform(0, 1, 100).reshape([100, 1])
    x_in_r = create_r_matrix(x, x.shape[1])
    y_in_r = create_r_matrix(y, y.shape[1])
    formula = robjects.Formula('y~x')
    env = formula.environment
    env['x'] = x_in_r
    env['y'] = y_in_r
    fit = stats.lm(formula)
    coeffs = array(fit[0])
    resids = array(fit[1])
    fitted_vals = array(fit[4])
    return (coeffs, resids, fitted_vals)

def create_r_matrix(py_array, ncols):
    if type(py_array) == type(matrix([1])) or type(py_array) == type(array([1])):
        py_array = py_array.tolist()
    r_vector = robjects.FloatVector(flatten_list(py_array))
    r_matrix = robjects.r['matrix'](r_vector, ncol=ncols)
    return r_matrix

def flatten_list(source):
    return [item for sublist in source for item in sublist]
test_regress()
So this seems to work for me:
def test_regress():
    stats = importr('stats')
    base = importr('base')  # needed for base.summary below
    x = random.uniform(0, 1, 100).reshape([100, 1])
    y = 1 + x + random.uniform(0, 1, 100).reshape([100, 1])
    x_in_r = create_r_matrix(x, x.shape[1])
    y_in_r = create_r_matrix(y, y.shape[1])
    formula = robjects.Formula('y~x')
    env = formula.environment
    env['x'] = x_in_r
    env['y'] = y_in_r
    fit = stats.lm(formula)
    coeffs = array(fit[0])
    resids = array(fit[1])
    fitted_vals = array(fit[4])
    modsum = base.summary(fit)
    rsquared = array(modsum[7])
    se = array(modsum.rx2('coefficients')[2:4])
    return (coeffs, resids, fitted_vals, rsquared, se)
Although, as I said, this is literally my first foray into RPy2, so there may be a better way to do that. But this version appears to output arrays containing the R-squared value along with the standard errors.
You can use print(modsum.names) to see the names of the components of the R object (kind of like names(modsum) in R) and then .rx and .rx2 are the equivalent of [ and [[ in R.
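For example, a sketch using the modsum object from above:

print(modsum.names)                     # like names(modsum) in R
rsq = modsum.rx2('r.squared')           # modsum[['r.squared']] in R
coef_table = modsum.rx('coefficients')  # modsum['coefficients'] in R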
@joran: Pretty good. I'd say that it is pretty much the way to do it.
from rpy2 import robjects
from rpy2.robjects.packages import importr
base = importr('base')
stats = importr('stats') # import only once !
def test_regress():
    x = base.matrix(stats.runif(100), nrow=100)
    y = (x.ro + base.matrix(stats.runif(100), nrow=100)).ro + 1  # not so nice
    formula = robjects.Formula('y~x')
    env = formula.environment
    env['x'] = x
    env['y'] = y
    fit = stats.lm(formula)
    coefs = stats.coef(fit)
    resids = stats.residuals(fit)
    fitted_vals = stats.fitted(fit)
    modsum = base.summary(fit)
    rsquared = modsum.rx2('r.squared')
    se = modsum.rx2('coefficients')[2:4]
    return (coefs, resids, fitted_vals, rsquared, se)
