The code below provides a complete reproducible example. My question is (mainly) on my function thetaMax(). This is a function which minimizes a log-likelihood of a psychometric process.
I am learning Python and doing so by translating my R functions to python. The code below works as expected. However, because I am learning Python, but my question is on style and quality.
The function thetaMax would run over hundreds of thousands of individuals and in my R code is efficient and is split among multiple cores. But, before thinking about parallel processing, my first goal is make the python code as fast and efficient as possible.
There are probably many parts of my function thetaMax that can be improved, but one aspect I am most concerned about is:
for i in range(0,len(x2)):
result[i] = (gpcm(theta, d = d[i], score = x2[i], a = 1, D = D))
I think doing this as a loop is probably bad and could be improved with some form of vectorization. Below are the complete contents necessary to implement this code, and thanks to anyone willing to offer suggestions on how to improve the code.
import numpy as np
from scipy.stats import binom
from scipy.optimize import minimize
def prob3pl(theta, a, b, c, D = 1.7):
result = c + (1 - c) / (1 + np.exp(-D * a * (theta - b)))
return(result)
def gpcm(theta, d, score, a, D = 1.7):
Da = D * a
result = np.exp(np.sum(Da * (theta - d[0:score])))/np.sum(np.exp(np.cumsum(Da * (theta - d))))
return(result)
d = np.array([[0, -1, .5, 1],[0,-.5,.2,1]])
a = np.array([1,1,1,1,1])
b = np.array([-1,.5,-.5,0,2])
c = np.array([0,0,0,0,0])
x = np.array([1,1,0,1,0,1,1])
indDichot = range(0,5,1)
def thetaMax(x, indDichot, a, b, c, D, d):
x1 = x[indDichot]
x2 = np.delete(x, indDichot)
result = [0] * len(x2)
def fn(theta):
if(len(x1) > 0):
p = prob3pl(theta, a, b, c, D = D)
logDichPart = sum(np.log(binom.pmf(x1,1,p)))
else:
logPolyPart = 0
if(len(x2) > 0):
for i in range(0,len(x2)):
result[i] = (gpcm(theta, d = d[i], score = x2[i], a = 1, D = D))
logPolyPart = sum(np.log(result))
else:
logPolyPart = 0
LL = -(logDichPart + logPolyPart)
return(LL)
out = minimize(fn, x0=0)
return(out)
thetaMax(x,indDichot,a,b,c,D=1,d = d)
Related
I am trying to code a simple equilibrium reaction where
[A]+[B] <-> [C] + [D]
[A]=50
[B]=50
[C]=50
[D]=50
k1 (forward)= 1
k-1 (reverse)= .5
Thank you
Your question is not very clear,
but i guess you would like to have a program that calculates the equilibrium concentrations of the reaction.
I also suppose that the input data you gave, [A], ..., [D], correspond to the initial concentrations.
Assuming that, the first thing I would advise you - before writing any line of code,
is to establish the formula that gives the equilibrium concentrations; and then implement it.
So, let's do first a bit of chemistry and maths :)
(I've based my reasoning on this course: http://cinet.chim.pagesperso-orange.fr/cours/chap3.html#L18704)
First, the extent at equilibrium is:
xe = [A]0-[A]∞ = [B]0-[B]∞ = [C]∞-[C]0 = [D]∞-[D]0 (1)
Also at equilibrium, you have the Law of Mass Action:
K(constant) = k1/k-1 = [C]∞⋅[D]∞/[A]∞⋅[B]∞ (2)
You can express the constant K as a function of [A]∞,
replacing [B]∞, [C]∞ and [D]∞ in (2) using equations in (1):
[B]∞ = [B]0 + [A]∞ - [A]0[C]∞ = [C]0 + [A]0 - [A]∞[D]∞ = [D]0 + [A]0 - [A]∞K = ([C]0+[A]0-[A]∞)⋅([D]0+[A]0-[A]∞)/[A]∞⋅([B]0+[A]∞-[A]0) (3)
Where (3) can be seen as a second degree equation:
x := [A]∞
(K-1)⋅x2 + [2⋅[A]0 + [C]0 + [D]0 + K⋅([B]0-[A]0)]⋅x - [[A]0⋅([A]0+[C]0+[D]0) + [C]0⋅[D]0] = 0
Similarly or by symmetry, you could have done the same for [B]∞, [C]∞ or [D]∞.
But it is easier to reuse (1).
Now, we can see better how to proceed. Let's code it!
import math
# Solve the quadratic equation a⋅x² + b⋅x + c = 0
def solve_2nd_degree_equation(a, b, c):
discriminant = (b ** 2) - (4 * a * c)
solution_1 = (-b - math.sqrt(discriminant)) / (2 * a)
solution_2 = (-b + math.sqrt(discriminant)) / (2 * a)
return [solution_1, solution_2]
The function above will be used for solving second degree equations.
class EquilibriumReactionProblem(object):
def __init__(self, a0, b0, c0, d0, k_forward, k_reverse):
self.a0 = a0
self.b0 = b0
self.c0 = c0
self.d0 = d0
self.k = k_forward / k_reverse
def _compute_a_equilibrium(self):
a = self.k - 1
b = 2 * self.a0 + self.c0 + self.d0 + self.k * (self.b0 - self.a0)
c = -(self.a0 * (self.a0 + self.c0 + self.d0) + self.c0 * self.d0)
[res] = [s for s in solve_2nd_degree_equation(a, b, c) if 0 <= s]
return round(res, 2)
def solve(self):
a_equilibrium = self._compute_a_equilibrium()
b_equilibrium = self.b0 + a_equilibrium - self.a0
c_equilibrium = self.c0 + self.a0 - a_equilibrium
d_equilibrium = self.d0 + self.a0 - a_equilibrium
print({
'a_equilibrium': a_equilibrium,
'b_equilibrium': b_equilibrium,
'c_equilibrium': c_equilibrium,
'd_equilibrium': d_equilibrium
})
All we have to do is now to run it with the initial conditions:
EquilibriumReactionProblem(
a0=50,
b0=50,
c0=50,
d0=50,
k_forward=1,
k_reverse=0.5
).solve()
{'a_equilibrium': 41.42, 'b_equilibrium': 41.42, 'c_equilibrium': 58.58, 'd_equilibrium': 58.58}
EDIT: You may also take advantage of ChemPy lib.
I'm trying to optimize simple integration in python which looks something like
from scipy import integrate
import numpy as np
from scipy.special import kv
import time
#Example function
def integrand(x, a, b, c):
return a * (x ** (-b)) * (np.sqrt(x ** (c) + 1) - 1)
#Real Function that I want to calculate
def Bes(xx):
return integrate.quad(lambda x: kv(5./3.,x), xx,np.inf)
def F(x,a,b,c,d,e,f):
zx = 1/((x**2.+1)*a)
feq = e*x**(f)
if (x>c):
feq *= c/x * np.exp(-(x/d)**2.)
return b*Bes(zx)*feq*x**2.
start = time.time()
array_length = 10
a = np.random.rand(array_length)+3.
b = np.random.rand(array_length)+1.
c = np.random.rand(array_length)
d = (np.random.rand(array_length)+1)*100.
e = np.random.rand(array_length)*100.
f = np.random.rand(array_length)
inte = np.array([])
for i in range(array_length):
result = integrate.quad(lambda x: F(x, a[i], b[i], c[i],d[i],e[i],f[i]),0.01,100000.)
inte = np.append(inte,result[0])
print("For array length = %i" % array_length)
print("Time = %.2f [sec]" %(time.time()-start))
But the problems that I'm facing are
a, b, c are array with length > 10^7 (same length)
integration range of x starts at 0.01 and extends to infinite
Integration at the small x (like [0.01, 1]) is very important and needs small step.
I want to integrate this function on each coefficient value and returns the entire array of integration as the result (length ~ 10^7), efficiently.
What kind of tools should I use?
(+) I just changed my code from simple example to actual integration form that I need to solve. Sorry for making confusion.
I suspected that this integral would converge for certain values of b and c, so I tried to evaluate this using Sympy:
import sympy
sympy.init_printing()
a, b, c = sympy.symbols('a, b, c', positive=True)
x = sympy.Symbol('x', positive=True)
sympy.integrate(a*(x**(-b))*(sympy.sqrt(x**c+1)-1), (x, 0, sympy.oo))
This means that you should be able to obtain the correct results with this code as long as your coefficients pass the check function.
from numpy import sqrt, pi
from scipy.special import gamma
def check(a, b, c):
assert (-(-b + 1)/c < 1)
assert (1/2 - (-b + 1)/c > 1)
assert (1 - (-b + 1)/c > 1)
def result(a, b, c):
return a*gamma(-b/c + 1 + 1/c)*gamma(b/c - 1/2 - 1/c)/(2*sqrt(pi)*(b - 1))
So i've made a simple program for numerically aproximating double integral, which accepts that the bounds of the inner integral are funcions:
def double_integral(func, limits, res=1000):
t = time.clock()
t1 = time.clock()
t2 = time.clock()
s = 0
a, b = limits[0], limits[1]
outer_values = np.linspace(a, b, res)
c_is_func = callable(limits[2])
d_is_func = callable(limits[3])
for y in outer_values:
if c_is_func:
c = limits[2](y)
else:
c = limits[2]
if d_is_func:
d = limits[3](y)
else:
d = limits[3]
dA = ((b - a) / res) * ((d - c) / res)
inner_values = np.linspace(c, d, res)
for x in inner_values:
t2 = time.clock() - t2
s += func(x, y) * dA
t1 = time.clock() - t1
t = time.clock() - t
return s, t, t1 / res, t2 / res**2
This is, however, terribly slow. When res=1000, such that the integral is a sum of a million parts, it takes about 5 seconds to run, but the answer is only correct to about the 3rd decimal in my experience. Is there any way to speed this up?
The code i am running to check the integral is
def f(x, y):
if (4 - y**2 - x**2) < 0:
return 0 #This is to avoid taking the root of negarive #'s
return np.sqrt(4 - y**2 - x**2)
def c(y):
return np.sqrt(2 * y - y**2)
def d(y):
return np.sqrt(4 - y**2)
# b d
# S S f(x,y) dx dy
# a c
a, b, = 0, 2
print(double_integral(f, [a, b, c, d]))
The integral is eaqual to 16/9
Edit
so i got a great answer over at coderewiev, but i am still baffeled by how scipy.integrate.dblquad seem to give me the wrong answer (see comment). does anyone have an answer for this?
To do some simulations in Python, I'm trying to generate numbers a,b,c such that a^2 + b^2 + c^2 = 1. I think generating some a between 0 and 1, then some b between 0 and sqrt(1 - a^2), and then c = sqrt(1 - a^2 - b^2) would work.
Floating point values are fine, the sum of squares should be close to 1. I want to keep generating them for some iterations.
Being new to Python, I'm not really sure how to do this. Negatives are allowed.
Edit: Thanks a lot for the answers!
According to this answer at stats.stackexchange.com, you should use normally distributed values to get uniformly distributed values on a sphere. That would mean, you could do:
import numpy as np
abc = np.random.normal(size=3)
a,b,c = abc/np.sqrt(sum(abc**2))
Just in case your interested in the probability densities I decided to do a comparison between the different approaches:
import numpy as np
import random
import math
def MSeifert():
a = 1
b = 1
while a**2 + b**2 > 1: # discard any a and b whose sum of squares already exceeds 1
a = random.random()
b = random.random()
c = math.sqrt(1 - a**2 - b**2) # fixed c
return a, b, c
def VBB():
x = np.random.uniform(0,1,3) # random numbers in [0, 1)
x /= np.sqrt(x[0] ** 2 + x[1] ** 2 + x[2] ** 2)
return x[0], x[1], x[2]
def user3684792():
theta = random.uniform(0, 0.5*np.pi)
phi = random.uniform(0, 0.5*np.pi)
return np.sin(theta)* np.cos(phi), np.sin(theta)*np.sin(phi), np.cos(theta)
def JohanL():
abc = np.random.normal(size=3)
a,b,c = abc/np.sqrt(sum(abc**2))
return a, b, c
def SeverinPappadeux():
cos_th = 2.0*random.uniform(0, 1.0) - 1.0
sin_th = math.sqrt(1.0 - cos_th*cos_th)
phi = random.uniform(0, 2.0*math.pi)
return sin_th * math.cos(phi), sin_th * math.sin(phi), cos_th
And plotting the distributions:
%matplotlib notebook
import matplotlib.pyplot as plt
f, axes = plt.subplots(3, 4)
for func_idx, func in enumerate([MSeifert, JohanL, user3684792, VBB]):
axes[0, func_idx].set_title(str(func.__name__))
res = [func() for _ in range(50000)]
for idx in range(3):
axes[idx, func_idx].hist([i[idx] for i in res], bins='auto')
axes[0, 0].set_ylabel('a')
axes[1, 0].set_ylabel('b')
axes[2, 0].set_ylabel('c')
plt.tight_layout()
With the result:
Explanation: The rows show the distributions for a, b and c respectively while the columns show the histograms (distributions) of the different approaches.
The only approaches that give a uniformly random distribution in the range (-1, 1) are JohanLs and Severin Pappadeux's approach. All other approaches have some features like spikes or a functional behavior in the range [0, 1). Note that these two solution currently gives values between -1 and 1 while all other approaches give values between 0 and 1.
I think it is actually a cool problem, and a nice way to do this is to just use spherical polar coordinates and generate the angles at random.
import random
import numpy as np
def random_pt():
theta = random.uniform(0, 0.5*np.pi)
phi = random.uniform(0, 0.5*np.pi)
return np.sin(theta)* np.cos(phi), np.sin(theta)*np.sin(phi), np.cos(theta)
You could do it like this:
import random
import math
def three_random_numbers_adding_to_one():
a = 1
b = 1
while a**2 + b**2 > 1: # discard any a and b whose sum of squares already exceeds 1
a = random.random()
b = random.random()
c = math.sqrt(1 - a**2 - b**2) # fixed c
return a, b, c
a, b, c = three_random_numbers_adding_to_one()
print(a**2 + b**2 + c**2)
However floats have only limited precision so these won't add to exactly 1, just approximately.
You may need to check if the numbers generated with this function are "random enough". It could be that this setup biases the "randomness".
The "right" answer depends on whether you are looking for a uniform random distribution in space, or on the surface of a sphere, or something else. If you are looking for points on the surface of a sphere, you still have to worry about the cos(theta) factor which will cause points to appear "bunched up" near the poles of the sphere. Since exact nature is not clear from your question, here is a "totally random" distribution that should work:
x = np.random.uniform(0,1,3) # random numbers in [0, 1)
x /= np.sqrt(x[0] ** 2 + x[1] ** 2 + x[2] ** 2)
Another advantage here is that since we are using numpy arrays, you can quickly scale to large sets of points too, by using x = np.random.uniform(0, 1, (3, n)) for any n.
Time to add another solution, heh...
This time it is truly uniform on the unit sphere point picking - check http://mathworld.wolfram.com/SpherePointPicking.html for details
import math
import random
def random_pt():
cos_th = 2.0*random.uniform(0, 1.0) - 1.0
sin_th = math.sqrt(1.0 - cos_th*cos_th)
phi = random.uniform(0, 2.0*math.pi)
return sin_th * math.cos(phi), sin_th * math.sin(phi), cos_th
for k in range(0, 100):
a, b, c = random_pt()
print("{0} {1} {2} {3}".format(a, b, c, a*a + b*b + c*c))
I have a CSV that provides a y value for three different x values for each row. When read into a pandas DataFrame, it looks like this:
5 10 20
0 -13.6 -10.7 -10.3
1 -14.1 -11.2 -10.8
2 -12.3 -9.4 -9.0
That is, for row 0, at 5 the value is -13.6, at 10 the value is -10.7, and at 20 the value is -10.3. These values are the result of an algorithm in the form:
def calc(x, r, b, c, d):
if x < 10:
y = (x * r + b) / x
elif x >= 10 and x < 20:
y = ((x * r) + (b - c)) / x
else:
y = ((x * r) + (b - d)) / x
return y
I want to find the value of r, b, c, and d for each row. I know certain things about each of the values. For example, for each row: r is in np.arange(-.05, -.11, -.01), b is in np.arange(0, -20.05, -.05), and c and d are in np.arange(0, 85, 5). I also know that d is <= c.
Currently, I am solving this with brute force. For each row, I iterate through every combination of r, b, c, and d and test if the value at the three x values is equal to the known value from the DataFrame. This works, giving me a few combinations for each row that are basically the same except for rounding differences.
The problem is that this approach takes a long time when I need to run it against 2,000+ rows. My question is: is there a faster way than iterating and testing every combination? My understanding is that this is a constraint satisfaction problem but, after that, I have no idea what to narrow in on; there are so many types of constraint satisfaction problems (it seems) that I'm still lost (I'm not even certain that this is such a problem!). Any help in pointing me in the right direction would be greatly appreciated.
I hope i understood the task correctly.
If you know the resolution/discretization of the parameters, it looks like a discrete-optimization problem (in general: hard), which could be solved by CP-approaches.
But if you allow these values to be continuous (and reformulate the formulas), it is:
(1) a Linear Program: if checking for feasible values (there needs to be a valid solution)
(2) a Linear Program: if optimizing parameters for minimization of sum of absolute differences (=errors)
(3) a Quadratic Program: if optimizing parameters for minimization of sum of squared differences (=errors) / equivalent to minimizing euclidean-norm
All three versions can be solved efficiently!
Here is a non-general (could be easily generalized) implementation of (3) using cvxpy to formulate the problem and ecos to solve the QP. Both tools are open-source.
Code
import numpy as np
import time
from cvxpy import *
from random import uniform
""" GENERATE TEST DATA """
def sample_params():
while True:
r = uniform(-0.11, -0.05)
b = uniform(-20.05, 0)
c = uniform(0, 85)
d = uniform(0, 85)
if d <= c:
return r, b, c, d
def calc(x, r, b, c, d):
if x < 10:
y = (x * r + b) / x
elif x >= 10 and x < 20:
y = ((x * r) + (b - c)) / x
else:
y = ((x * r) + (b - d)) / x
return y
N = 2000
sampled_params = [sample_params() for i in range(N)]
data_5 = np.array([calc(5, *sampled_params[i]) for i in range(N)])
data_10 = np.array([calc(10, *sampled_params[i]) for i in range(N)])
data_20 = np.array([calc(20, *sampled_params[i]) for i in range(N)])
data = np.empty((N, 3))
for i in range(N):
data[i, :] = [data_5[i], data_10[i], data_20[i]]
""" SOLVER """
def solve(row):
""" vars """
R = Variable(1)
B = Variable(1)
C = Variable(1)
D = Variable(1)
E = Variable(3)
""" constraints """
constraints = []
# bounds
constraints.append(R >= -.11)
constraints.append(R <= -.05)
constraints.append(B >= -20.05)
constraints.append(B <= 0.0)
constraints.append(C >= 0.0)
constraints.append(C <= 85.0)
constraints.append(D >= 0.0)
constraints.append(D <= 85.0)
constraints.append(D <= C)
# formula of model
constraints.append((1.0 / 5.0) * B + R == row[0] + E[0]) # alternate function form: b/x+r
constraints.append((1.0 / 10.0) * B - (1.0 / 10.0) * C == row[1] + E[1]) # alternate function form: b/x-c/x+r
constraints.append((1.0 / 20.0) * B - (1.0 / 20.0) * D == row[2] + E[2]) # alternate function form: b/x-d/x+r
""" Objective """
objective = Minimize(norm(E, 2))
""" Solve """
problem = Problem(objective, constraints)
problem.solve(solver=ECOS, verbose=False)
return R.value, B.value, C.value, D.value, E.value
start = time.time()
for i in range(N):
r, b, c, d, e = solve(data[i])
end = time.time()
print('seconds taken: ', end-start)
print('seconds per row: ', (end-start) / N)
Output
('seconds taken: ', 20.620506048202515)
('seconds per row: ', 0.010310253024101258)