Python integration of a large array of coefficients

I'm trying to optimize a simple integration in Python, which looks something like this:
from scipy import integrate
import numpy as np
from scipy.special import kv
import time

# Example function
def integrand(x, a, b, c):
    return a * (x ** (-b)) * (np.sqrt(x ** c + 1) - 1)

# Real function that I want to calculate
def Bes(xx):
    # [0] extracts the integral value from the (value, error) tuple quad returns
    return integrate.quad(lambda x: kv(5. / 3., x), xx, np.inf)[0]

def F(x, a, b, c, d, e, f):
    zx = 1 / ((x ** 2. + 1) * a)
    feq = e * x ** f
    if x > c:
        feq *= c / x * np.exp(-(x / d) ** 2.)
    return b * Bes(zx) * feq * x ** 2.

start = time.time()
array_length = 10
a = np.random.rand(array_length) + 3.
b = np.random.rand(array_length) + 1.
c = np.random.rand(array_length)
d = (np.random.rand(array_length) + 1) * 100.
e = np.random.rand(array_length) * 100.
f = np.random.rand(array_length)

inte = np.array([])
for i in range(array_length):
    result = integrate.quad(lambda x: F(x, a[i], b[i], c[i], d[i], e[i], f[i]), 0.01, 100000.)
    inte = np.append(inte, result[0])

print("For array length = %i" % array_length)
print("Time = %.2f [sec]" % (time.time() - start))
But the problems I'm facing are:
- a, b, c are arrays of length > 10^7 (all the same length)
- the integration range of x starts at 0.01 and extends to infinity
- integration at small x (like [0.01, 1]) is very important and needs small steps
I want to integrate this function for each coefficient value and return the entire array of integrals (length ~ 10^7), efficiently.
What kind of tools should I use?
(+) I just changed my code from the simple example to the actual integral I need to solve. Sorry for the confusion.

I suspected that this integral would converge for certain values of b and c, so I tried to evaluate it using SymPy:
import sympy
sympy.init_printing()
a, b, c = sympy.symbols('a, b, c', positive=True)
x = sympy.Symbol('x', positive=True)
sympy.integrate(a*(x**(-b))*(sympy.sqrt(x**c+1)-1), (x, 0, sympy.oo))
SymPy returns a closed-form answer in terms of gamma functions, valid only under certain convergence conditions on b and c. This means that you should be able to obtain the correct results with the code below, as long as your coefficients pass the check function:
from numpy import sqrt, pi
from scipy.special import gamma

def check(a, b, c):
    assert (-(-b + 1)/c < 1)
    assert (1/2 - (-b + 1)/c > 1)
    assert (1 - (-b + 1)/c > 1)

def result(a, b, c):
    return a*gamma(-b/c + 1 + 1/c)*gamma(b/c - 1/2 - 1/c)/(2*sqrt(pi)*(b - 1))
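Since scipy.special.gamma broadcasts over NumPy arrays, the closed form for the example integrand can be evaluated for all coefficient triples at once, with no quadrature at all. A minimal sketch (the masking strategy and array sizes are my own, not from the original answer):

import numpy as np
from scipy.special import gamma

def result_vectorized(a, b, c):
    # evaluate the closed form only where the convergence conditions hold
    valid = (-(-b + 1)/c < 1) & (1/2 - (-b + 1)/c > 1) & (1 - (-b + 1)/c > 1)
    out = np.full(a.shape, np.nan)
    av, bv, cv = a[valid], b[valid], c[valid]
    out[valid] = (av * gamma(-bv/cv + 1 + 1/cv) * gamma(bv/cv - 1/2 - 1/cv)
                  / (2*np.sqrt(np.pi)*(bv - 1)))
    return out

# stand-ins for the real ~1e7-length coefficient arrays
a = np.random.rand(10**6) + 3.
b = np.random.rand(10**6) + 1.
c = np.random.rand(10**6)
inte = result_vectorized(a, b, c)  # NaN where the integral does not converge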

Related

Find maxima for a negative parabolic equation

I have the following negative quadratic equation:
-0.03402645959398278x^2 + 156.003469x - 178794.025
I want to know if there is a straightforward way (using numpy/scipy or any other library) to get the value of x where the first derivative is zero (the maximum). I'm aware I could:
- change the sign of the equation and apply the scipy.optimize.minimize method, or
- use the derivative of the equation to get the value where the slope is zero
For instance:
import numpy as np
from scipy.optimize import minimize

quad_eq = np.poly1d([-0.03402645959398278, 156.003469, -178794.025])

############ scipy ####################
neg_quad_eq = np.poly1d(np.negative(quad_eq))
fit = minimize(neg_quad_eq, x0=15)
slope_zero_neg = fit.x[0]
maxima = np.polyval(quad_eq, slope_zero_neg)
print(maxima)

################## numpy ######################
first_dev = np.polyder(quad_eq)
slope_zero = first_dev.r
maxima = np.polyval(quad_eq, slope_zero)
print(maxima)
Is there a more straightforward way to get the same result?
You don't need all that code... The first derivative of a x^2 + b x + c is 2a x + b, so solving 2a x + b = 0 for x yields x = -b / (2a), which is exactly the maximum you are searching for:
import numpy as np
import matplotlib.pyplot as plt

def func(x, a=-0.03402645959398278, b=156.003469, c=-178794.025):
    return a * x**2 + b * x + c

def func_max(a=-0.03402645959398278, b=156.003469, c=-178794.025):
    maximum_x = -b / (2 * a)
    maximum_y = a * maximum_x**2 + b * maximum_x + c
    return maximum_x, maximum_y

x = np.linspace(-50000, 50000, 100)
y = func(x)
mx, my = func_max()
print('maximum:', mx, my)
maximum: 2292.384674478263 15.955750522436574
and verify
plt.plot(x, y)
plt.axvline(mx, color='r')
plt.axhline(my, color='r')

How to optimize my function for numeric optimization?

The code below provides a complete reproducible example. My question is (mainly) about my function thetaMax(), which minimizes the log-likelihood of a psychometric model.
I am learning Python by translating my R functions, and the code below works as expected; my question is about style and quality.
The function thetaMax would run over hundreds of thousands of individuals; my R version is efficient and split across multiple cores. But before thinking about parallel processing, my first goal is to make the Python code as fast and efficient as possible.
There are probably many parts of my function thetaMax that can be improved, but one aspect I am most concerned about is:
for i in range(0, len(x2)):
    result[i] = gpcm(theta, d=d[i], score=x2[i], a=1, D=D)
I think doing this as a loop is probably bad and could be improved with some form of vectorization (see the sketch after the code). Below are the complete contents necessary to run this code, and thanks to anyone willing to offer suggestions on how to improve it.
import numpy as np
from scipy.stats import binom
from scipy.optimize import minimize

def prob3pl(theta, a, b, c, D=1.7):
    return c + (1 - c) / (1 + np.exp(-D * a * (theta - b)))

def gpcm(theta, d, score, a, D=1.7):
    Da = D * a
    return np.exp(np.sum(Da * (theta - d[0:score]))) / np.sum(np.exp(np.cumsum(Da * (theta - d))))

d = np.array([[0, -1, .5, 1], [0, -.5, .2, 1]])
a = np.array([1, 1, 1, 1, 1])
b = np.array([-1, .5, -.5, 0, 2])
c = np.array([0, 0, 0, 0, 0])
x = np.array([1, 1, 0, 1, 0, 1, 1])
indDichot = range(0, 5)

def thetaMax(x, indDichot, a, b, c, D, d):
    x1 = x[indDichot]
    x2 = np.delete(x, indDichot)
    result = [0] * len(x2)
    def fn(theta):
        if len(x1) > 0:
            p = prob3pl(theta, a, b, c, D=D)
            logDichPart = sum(np.log(binom.pmf(x1, 1, p)))
        else:
            logDichPart = 0  # logDichPart must be defined even when there are no dichotomous items
        if len(x2) > 0:
            for i in range(len(x2)):
                result[i] = gpcm(theta, d=d[i], score=x2[i], a=1, D=D)
            logPolyPart = sum(np.log(result))
        else:
            logPolyPart = 0
        return -(logDichPart + logPolyPart)
    out = minimize(fn, x0=0)
    return out

thetaMax(x, indDichot, a, b, c, D=1, d=d)
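One possible direction for the loop flagged above, sketched by me rather than taken from an answer: because each gpcm call slices d[0:score] to a different length, the computation is ragged and hard to vectorize completely, but the Python-level loop can at least be collapsed into a comprehension, with the log taken once over the whole array:

# inside fn(), replacing the explicit loop over x2:
vals = np.array([gpcm(theta, d=d[i], score=x2[i], a=1, D=D)
                 for i in range(len(x2))])
logPolyPart = np.log(vals).sum()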

Python loglog graph

In the following code I have implemented the composite Simpson's rule in Python. I have tested it by integrating f = sin(x) over [0, pi/2] and plotting the resulting absolute error as a function of n for a suitable range of integer values of n, as shown below. Now I am trying to verify that my method is of order 4. To do this, instead of plotting only the error vs n, I need to overlay a plot of n^(-4) and show that both have the same slope.
from math import pi, cos, sin
from matplotlib import pyplot as plt

def simpson(f, a, b, n):
    """Approximates the definite integral of f from a to b by the
    composite Simpson's rule, using 2n subintervals."""
    h = (b - a) / (2*n)
    s = f(a) + f(b)
    for i in range(1, 2*n, 2):
        s += 4 * f(a + i * h)
    for i in range(2, 2*n - 1, 2):
        s += 2 * f(a + i * h)
    return s * h / 3

diffs = {}
exact = 1 - cos(pi/2)
for n in range(1, 100):
    result = simpson(lambda x: sin(x), 0.0, pi/2, n)
    diffs[2*n] = abs(exact - result)  # use 2*n or n here, your choice

ordered = sorted(diffs.items())
x, y = zip(*ordered)
plt.autoscale()
plt.loglog(x, y)
plt.xlabel("Intervals")
plt.ylabel("Error")
plt.show()
This results in my error graph:
You can add another line to your plot by making another call to plt.loglog(). So just before your plt.show() you can add:
n = []
n_inv_4 = []
for i in range(1, 100):
    n.append(2*i)
    n_inv_4.append(1.0 / ((2*i)**4))
plt.loglog(n, n_inv_4)
Note that the above calculations use (2*i) to match the (2*n) used in the simpson method.
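Equivalently (a stylistic variant of mine, not from the original answer), the reference line can be built with NumPy without an explicit loop:

import numpy as np

n = 2 * np.arange(1, 100)   # same 2*i values as above
plt.loglog(n, 1.0 / n**4)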

Generating random numbers a, b, c such that a^2 + b^2 + c^2 = 1

To do some simulations in Python, I'm trying to generate numbers a,b,c such that a^2 + b^2 + c^2 = 1. I think generating some a between 0 and 1, then some b between 0 and sqrt(1 - a^2), and then c = sqrt(1 - a^2 - b^2) would work.
Floating point values are fine; the sum of squares should be close to 1. I want to keep generating them for some iterations.
Being new to Python, I'm not really sure how to do this. Negatives are allowed.
Edit: Thanks a lot for the answers!
According to this answer at stats.stackexchange.com, you should use normally distributed values to get uniformly distributed values on a sphere. That would mean you could do:
import numpy as np

abc = np.random.normal(size=3)
a, b, c = abc / np.sqrt(sum(abc**2))
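A quick sanity check (my addition, mirroring the check used further down): the normalized triple should lie on the unit sphere up to floating-point rounding:

print(a**2 + b**2 + c**2)  # ~1.0, up to rounding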
Just in case you're interested in the probability densities, I decided to do a comparison between the different approaches:
import numpy as np
import random
import math

def MSeifert():
    a = 1
    b = 1
    while a**2 + b**2 > 1:  # discard any a and b whose sum of squares already exceeds 1
        a = random.random()
        b = random.random()
    c = math.sqrt(1 - a**2 - b**2)  # fixed c
    return a, b, c

def VBB():
    x = np.random.uniform(0, 1, 3)  # random numbers in [0, 1)
    x /= np.sqrt(x[0] ** 2 + x[1] ** 2 + x[2] ** 2)
    return x[0], x[1], x[2]

def user3684792():
    theta = random.uniform(0, 0.5*np.pi)
    phi = random.uniform(0, 0.5*np.pi)
    return np.sin(theta)*np.cos(phi), np.sin(theta)*np.sin(phi), np.cos(theta)

def JohanL():
    abc = np.random.normal(size=3)
    a, b, c = abc / np.sqrt(sum(abc**2))
    return a, b, c

def SeverinPappadeux():
    cos_th = 2.0*random.uniform(0, 1.0) - 1.0
    sin_th = math.sqrt(1.0 - cos_th*cos_th)
    phi = random.uniform(0, 2.0*math.pi)
    return sin_th * math.cos(phi), sin_th * math.sin(phi), cos_th
And plotting the distributions:
%matplotlib notebook
import matplotlib.pyplot as plt

f, axes = plt.subplots(3, 4)
for func_idx, func in enumerate([MSeifert, JohanL, user3684792, VBB]):
    axes[0, func_idx].set_title(str(func.__name__))
    res = [func() for _ in range(50000)]
    for idx in range(3):
        axes[idx, func_idx].hist([i[idx] for i in res], bins='auto')
axes[0, 0].set_ylabel('a')
axes[1, 0].set_ylabel('b')
axes[2, 0].set_ylabel('c')
plt.tight_layout()
With the result:
Explanation: The rows show the distributions for a, b and c respectively while the columns show the histograms (distributions) of the different approaches.
The only approaches that give a uniformly random distribution in the range (-1, 1) are JohanL's and Severin Pappadeux's. All other approaches have some features like spikes or a functional behavior in the range [0, 1). Note that these two solutions currently give values between -1 and 1, while all other approaches give values between 0 and 1.
I think it is actually a cool problem, and a nice way to do this is to just use spherical polar coordinates and generate the angles at random.
import random
import numpy as np

def random_pt():
    theta = random.uniform(0, 0.5*np.pi)
    phi = random.uniform(0, 0.5*np.pi)
    return np.sin(theta)*np.cos(phi), np.sin(theta)*np.sin(phi), np.cos(theta)
You could do it like this:
import random
import math

def three_random_numbers_adding_to_one():
    a = 1
    b = 1
    while a**2 + b**2 > 1:  # discard any a and b whose sum of squares already exceeds 1
        a = random.random()
        b = random.random()
    c = math.sqrt(1 - a**2 - b**2)  # fixed c
    return a, b, c

a, b, c = three_random_numbers_adding_to_one()
print(a**2 + b**2 + c**2)
However, floats have only limited precision, so these won't add up to exactly 1, just approximately.
You may need to check if the numbers generated with this function are "random enough". It could be that this setup biases the "randomness".
The "right" answer depends on whether you are looking for a uniform random distribution in space, or on the surface of a sphere, or something else. If you are looking for points on the surface of a sphere, you still have to worry about the cos(theta) factor, which will cause points to appear "bunched up" near the poles of the sphere. Since the exact requirement is not clear from your question, here is a "totally random" distribution that should work:
x = np.random.uniform(0,1,3) # random numbers in [0, 1)
x /= np.sqrt(x[0] ** 2 + x[1] ** 2 + x[2] ** 2)
Another advantage here is that since we are using numpy arrays, you can quickly scale to large sets of points too, by using x = np.random.uniform(0, 1, (3, n)) for any n.
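Building on the scaling remark above, here is what the batched version could look like; this is a sketch of mine (using np.linalg.norm for the normalization), not part of the original answer:

import numpy as np

n = 100000
x = np.random.uniform(0, 1, (3, n))        # one column per point
x /= np.linalg.norm(x, axis=0)             # normalize every column to unit length
print(np.allclose((x**2).sum(axis=0), 1))  # True: all points lie on the sphere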
Time to add another solution, heh...
This time it is truly uniform point picking on the unit sphere - check http://mathworld.wolfram.com/SpherePointPicking.html for details.
import math
import random

def random_pt():
    cos_th = 2.0*random.uniform(0, 1.0) - 1.0
    sin_th = math.sqrt(1.0 - cos_th*cos_th)
    phi = random.uniform(0, 2.0*math.pi)
    return sin_th * math.cos(phi), sin_th * math.sin(phi), cos_th

for k in range(0, 100):
    a, b, c = random_pt()
    print("{0} {1} {2} {3}".format(a, b, c, a*a + b*b + c*c))

Way to solve constraint satisfaction faster than brute force?

I have a CSV that provides a y value for three different x values for each row. When read into a pandas DataFrame, it looks like this:
       5      10      20
0  -13.6   -10.7   -10.3
1  -14.1   -11.2   -10.8
2  -12.3    -9.4    -9.0
That is, for row 0, at 5 the value is -13.6, at 10 the value is -10.7, and at 20 the value is -10.3. These values are the result of an algorithm in the form:
def calc(x, r, b, c, d):
    if x < 10:
        y = (x * r + b) / x
    elif x >= 10 and x < 20:
        y = ((x * r) + (b - c)) / x
    else:
        y = ((x * r) + (b - d)) / x
    return y
I want to find the value of r, b, c, and d for each row. I know certain things about the values: for each row, r is in np.arange(-.05, -.11, -.01), b is in np.arange(0, -20.05, -.05), and c and d are in np.arange(0, 85, 5). I also know that d <= c.
Currently, I am solving this with brute force. For each row, I iterate through every combination of r, b, c, and d and test if the value at the three x values is equal to the known value from the DataFrame. This works, giving me a few combinations for each row that are basically the same except for rounding differences.
The problem is that this approach takes a long time when I need to run it against 2,000+ rows. My question is: is there a faster way than iterating and testing every combination? My understanding is that this is a constraint satisfaction problem but, after that, I have no idea what to narrow in on; there are so many types of constraint satisfaction problems (it seems) that I'm still lost (I'm not even certain that this is such a problem!). Any help in pointing me in the right direction would be greatly appreciated.
I hope I understood the task correctly.
If you know the resolution/discretization of the parameters, it looks like a discrete-optimization problem (in general: hard), which could be solved by CP-approaches.
But if you allow these values to be continuous (and reformulate the formulas), it is:
(1) a Linear Program: if checking for feasible values (there needs to be a valid solution)
(2) a Linear Program: if optimizing parameters for minimization of sum of absolute differences (=errors)
(3) a Quadratic Program: if optimizing parameters for minimization of sum of squared differences (=errors) / equivalent to minimizing euclidean-norm
All three versions can be solved efficiently!
Here is a non-general (could be easily generalized) implementation of (3) using cvxpy to formulate the problem and ecos to solve the QP. Both tools are open-source.
Code
import numpy as np
import time
from cvxpy import *
from random import uniform

""" GENERATE TEST DATA """
def sample_params():
    while True:
        r = uniform(-0.11, -0.05)
        b = uniform(-20.05, 0)
        c = uniform(0, 85)
        d = uniform(0, 85)
        if d <= c:
            return r, b, c, d

def calc(x, r, b, c, d):
    if x < 10:
        y = (x * r + b) / x
    elif x >= 10 and x < 20:
        y = ((x * r) + (b - c)) / x
    else:
        y = ((x * r) + (b - d)) / x
    return y

N = 2000
sampled_params = [sample_params() for i in range(N)]
data_5 = np.array([calc(5, *sampled_params[i]) for i in range(N)])
data_10 = np.array([calc(10, *sampled_params[i]) for i in range(N)])
data_20 = np.array([calc(20, *sampled_params[i]) for i in range(N)])
data = np.empty((N, 3))
for i in range(N):
    data[i, :] = [data_5[i], data_10[i], data_20[i]]
""" SOLVER """
def solve(row):
    """ vars """
    R = Variable(1)
    B = Variable(1)
    C = Variable(1)
    D = Variable(1)
    E = Variable(3)

    """ constraints """
    constraints = []
    # bounds
    constraints.append(R >= -.11)
    constraints.append(R <= -.05)
    constraints.append(B >= -20.05)
    constraints.append(B <= 0.0)
    constraints.append(C >= 0.0)
    constraints.append(C <= 85.0)
    constraints.append(D >= 0.0)
    constraints.append(D <= 85.0)
    constraints.append(D <= C)
    # formula of model
    constraints.append((1.0 / 5.0) * B + R == row[0] + E[0])                      # alternate function form: b/x + r
    constraints.append((1.0 / 10.0) * B - (1.0 / 10.0) * C + R == row[1] + E[1])  # alternate function form: b/x - c/x + r
    constraints.append((1.0 / 20.0) * B - (1.0 / 20.0) * D + R == row[2] + E[2])  # alternate function form: b/x - d/x + r

    """ Objective """
    objective = Minimize(norm(E, 2))

    """ Solve """
    problem = Problem(objective, constraints)
    problem.solve(solver=ECOS, verbose=False)
    return R.value, B.value, C.value, D.value, E.value

start = time.time()
for i in range(N):
    r, b, c, d, e = solve(data[i])
end = time.time()

print('seconds taken: ', end - start)
print('seconds per row: ', (end - start) / N)
Output
('seconds taken: ', 20.620506048202515)
('seconds per row: ', 0.010310253024101258)
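For variant (2) above, minimizing the sum of absolute errors, only the objective changes; the 1-norm keeps the problem linear, so it becomes an LP. A sketch of mine, not part of the original answer:

# inside solve(), swap the objective; everything else stays the same
objective = Minimize(norm(E, 1))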
