Complex vectors and shape for arrays - python

I am slightly new to Python and I am trying to convert some code. It is an approximation method, which isn't important here. In my oddev function I get

c2[1:modes+1] = v * 1j
ValueError: could not broadcast input array from shape (25) into shape (25,1)

When I do this in Matlab, I believe it automatically casts it and stores the complex array. The function is getting the coefficients from a partial sine transform. At first I tried storing the random matrix, which is just an array, using the np.matlib method; this had the same shape, but I believe I will lose the real values of the filter when I cast it. How do I store this?
import math
import numpy as np
import numpy.matlib  # np.matlib is not imported automatically with numpy

def quickcontmin(datain):
    n = np.shape(datain)[0]
    m = math.floor(n / 2)
    modes = math.floor(m / 2)
    addl = 20
    nn = 20 * n
    chi = 10 ** -13
    def evenhp(xv):
        "Even high pass"
        n1 = np.shape(xv)[0]
        vx = np.array(xv[:-1])
        vx = vx[::-1]
        c1 = np.append(xv, vx)
        c1 = np.fft.fft(c1)
        c1[0:modes - 1] = 0.0
        c1[-1 - modes + 2:-1] = 0.0
        evenl = np.real(np.fft.ifft(c1))
        even = evenl[0:n1 - 1]
        return even
    def evenhpt(xv):
        "Transpose of EvenHP"
        n1 = np.shape(xv)[0]
        xy = np.zeros((n1 - 2, 1))
        c1 = np.append(xv, xy)
        c1 = np.fft.fft(c1)
        c1[0:modes - 1] = 0.0
        c1[-1 - modes + 1:-1] = 0.0
        evenl = np.real(np.fft.ifft(c1))
        even = evenl[0:n1 - 1]
        even[1:-2] = even[1:-2] + evenl[-1:-1:n1 + 1]
        return even
    def evenlp(xv):
        "Low pass cosine filter"
        n1 = np.shape(xv)[0]
        vx = np.array(xv[:-1])
        vx = vx[::-1]
        c1 = np.append(xv, vx)
        c1 = np.fft.fft(c1)
        c1[modes + 1:-1 - modes + 1] = 0.0
        evenl = np.real(np.fft.ifft(c1))
        even = evenl[0:n1 - 1]
        return even
    def oddev(xv):
        "Evaluate the sine modes on the grid"
        c2 = np.zeros((2 * n - 2, 1)) * 1j
        v = np.array(xv[:])
        v1 = v[:-1]
        v1 = v[::-1]
        c2[1:modes + 1] = v * 1j  # <- the line that raises the ValueError
        c2[-1 - modes + 1:-1] = -v1 * 1j
        evall = np.fft.ifft(c2) * math.sqrt(2 * n - 2)
        eva = evall[0:n - 1]
        return eva
    def oddevt(xv):
        "Transpose the sine modes on the function OddEv"
        c1 = np.array(xv[1:-2])
        c1 = np.insert(c1, 0.0, 0)
        c1 = np.append(c1, 0.0)
        c1 = np.append(c1, xv[-2:-1:2])
        c1a = np.divide(np.fft.fft(c1), math.sqrt(2 * n - 2))
        fcoef = np.imag(c1a[1:modes])
        return fcoef
    def eextnd(xv):
        "Obtain cosine coefficients and evaluate on the refined grid"
        vx = np.array(xv[:-1])
        vx = vx[::-1]
        c1 = np.append(xv, vx)
        c1 = np.fft.fft(c1)
        cL = np.zeros((2 * nn - 2, 1))
        cL[0:modes - 1] = c1[0:modes - 1]
        cL[-1 - modes + 1:-1] = c1[-1 - modes + 1:-1]
        evenexL = np.multiply(np.fft.ifft(cL), (nn - 1) / (n - 1))
        evenex = evenexL[0:nn - 1]
        return evenex
    def oextnd(xv):
        "Evaluate sine coefficients on the refined grid"
        c2 = np.zeros((2 * nn - 2, 1))
        c2[0] = 0.0
        c2[1:modes + 1] = np.multiply(xv[0:-1], 1j)
        c2[-1 - modes + 1:-1] = np.multiply(-xv[-1:-1:1], 1j)
        evall = np.real(np.multiply(np.fft.ifft(c2), math.sqrt(2 * n - 2) * (2 * nn - 2) / (2 * n - 2)))
        oox = evall[0:nn - 1]
        return oox
    dc = evenlp(datain)
    # L in the paper, the number of vectors used to sample the column space
    lll = round(4 * math.log(m) / math.log(2)) + addl
    lll = int(lll)
    # The following should be straightforward from the pseudo-code
    w = 2 * np.random.rand(modes, lll) - 1
    p = np.matlib.zeros(shape=(n, lll))
    for j in range(lll):
        p[:, j] = evenhp(oddev(w[:, j]))
    q, r = np.linalg.qr(p, mode='reduced')
    z = np.zeros(shape=(modes, lll))
    for j in range(lll):
        z[:, j] = oddevt(evenhpt(q[:, j]))
    un, s, v = np.linalg.svd(z, full_matrices='False')
    ds = np.diag(s)
    aa = np.extract(np.diag(s) > (chi))
    aa[-1] = aa
    aa = int(aa)
    s = 0 * s
    for j in range(aa):
        s[j, j] = 1.0 / ds(j)
    # find the sine coefficients
    b = un * s * v.T * q.T * evenhp(datain)
    # Constructing the continuation
    exs = oddev(b)
    pexs = evenlp(exs)
    dataCont = exs - pexs + dc
    dataCont[n + 1:2 * n - 2] = -exs[-2:-1:1] - pexs[-2:-1:1] + dc[-2:-1:1]
    # Evaluate the continuation on the refined grid
    dataRefined = eextnd(dc - exs) + oextnd(b)
    return dataRefined, dataCont

n1 = 100
t = np.linspace(0, 2 * math.pi, n1)
y = np.sin(t)
data = quickcontmin(y)
dc1 = data[1]
dc1 = dc1[0:n1 - 1]

Replacing c2[1:modes+1] = v * 1j with c2[1:modes+1, 0] = v * 1j should fix that specific error: the extra , 0 makes the left-hand side a 1-d slice of shape (25,), matching v, instead of a (25, 1) column.
More consistent would be to replace:

v = np.array(xv[:])
v1 = v[:-1]
v1 = v[::-1]

with

v = xv
v1 = v[:-1]

xv is already a column vector, so you don't need to transform it into a 1-d vector when you later need a column vector anyway.
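Here is a minimal, self-contained illustration of the mismatch and of both fixes, using the sizes implied by the question (n = 100, so modes = 25 and c2 has length 2*n - 2 = 198):

import numpy as np

modes = 25
c2 = np.zeros((198, 1), dtype=complex)  # column vector, as in oddev
v = np.random.rand(modes)               # 1-d array of shape (25,)

# c2[1:modes + 1] = v * 1j              # ValueError: shape (25,) vs (25, 1)
c2[1:modes + 1, 0] = v * 1j             # works: both sides are 1-d
c2[1:modes + 1] = v[:, None] * 1j       # also works: make v a column first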

Related

Runge Kutta 4th order Python

I am trying to solve this equation using 4th-order Runge-Kutta, applying d2Q/dt2 = F(y, x, v) and dQ/dt = u, with Q = y in my program.
When I try to run the code I get this error:
Traceback (most recent call last):
File "C:\Users\Egw\Desktop\Analysh\Askhsh1\asdasda.py", line 28, in <module>
k1 = F(y, u, x) #(x, v, t)
File "C:\Users\Egw\Desktop\Analysh\Askhsh1\asdasda.py", line 13, in F
return ((Vo/L -(R0/L)*u -(R1/L)*u**3 - y*(1/L*C)))
OverflowError: (34, 'Result too large')
I tried using the decimal library but I still couldn't make it work properly, though I might not have used it correctly.
My code is this one:
import numpy as np
from math import pi
from numpy import arange
from matplotlib.pyplot import plot, show

# parameters
R0 = 200
R1 = 250
L = 15
h = 0.002
Vo = 1000
C = 4.2*10**(-6)
t = 0.93

def F(y, u, x):
    return ((Vo/L - (R0/L)*u - (R1/L)*u**3 - y*(1/L*C)))

xpoints = arange(0, t, h)
ypoints = []
upoints = []
y = 0.0
u = Vo/L
for x in xpoints:
    ypoints.append(y)
    upoints.append(u)
    m1 = u
    k1 = F(y, u, x)  # (x, v, t)
    m2 = h*(u + 0.5*k1)
    k2 = h*F(y + 0.5*m1, u + 0.5*k1, x + 0.5*h)
    m3 = h*(u + 0.5*k2)
    k3 = h*F(y + 0.5*m2, u + 0.5*k2, x + 0.5*h)
    m4 = h*(u + k3)
    k4 = h*F(y + m3, u + k3, x + h)
    y += (m1 + 2*m2 + 2*m3 + m4)/6
    u += (k1 + 2*k2 + 2*k3 + k4)/6
plot(xpoints, upoints)
show()
plot(xpoints, ypoints)
show()
I expected to get the plots of u and y against t.
It turns out I had messed up the equations I was using for Runge-Kutta.
The correct code is the following:
import numpy as np
from math import pi
from numpy import arange
from matplotlib.pyplot import plot, show

# parameters
R0 = 200
R1 = 250
L = 15
h = 0.002
Vo = 1000
C = 4.2*10**(-6)
t0 = 0

# dz/dx
def G(x, y, z):
    return Vo/L - (R0/L)*z - (R1/L)*z**3 - y/(L*C)

# dy/dx
def F(x, y, z):
    return z

t = np.arange(t0, 0.93, h)
x = np.zeros(len(t))
y = np.zeros(len(t))
z = np.zeros(len(t))
y[0] = 0.0
z[0] = 0
for i in range(1, len(t)):
    k0 = h*F(x[i-1], y[i-1], z[i-1])
    l0 = h*G(x[i-1], y[i-1], z[i-1])
    k1 = h*F(x[i-1] + h*0.5, y[i-1] + k0*0.5, z[i-1] + l0*0.5)
    l1 = h*G(x[i-1] + h*0.5, y[i-1] + k0*0.5, z[i-1] + l0*0.5)
    k2 = h*F(x[i-1] + h*0.5, y[i-1] + k1*0.5, z[i-1] + l1*0.5)
    l2 = h*G(x[i-1] + h*0.5, y[i-1] + k1*0.5, z[i-1] + l1*0.5)
    k3 = h*F(x[i-1] + h, y[i-1] + k2, z[i-1] + l2)
    l3 = h*G(x[i-1] + h, y[i-1] + k2, z[i-1] + l2)
    y[i] = y[i-1] + (k0 + 2*k1 + 2*k2 + k3)/6
    z[i] = z[i-1] + (l0 + 2*l1 + 2*l2 + l3)/6
Q = y
I = z
plot(t, Q)
show()
plot(t, I)
show()
If I may draw your attention to these 4 lines

m1 = u
k1 = F(y, u, x)  # (x, v, t)
m2 = h*(u + 0.5*k1)
k2 = h*F(y + 0.5*m1, u + 0.5*k1, x + 0.5*h)

you should note a fundamental structural difference between the first two lines and the second pair of lines: you need to multiply by the step size h in the first pair as well.
The next problem is the step size and the cubic term. That term contributes about 3*(R1/L)*u^2 ~ 50*u^2 to the Lipschitz constant. In the original IVP from the question, with u = Vo/L ~ 70, this is of size 2.5e+5; to keep just that term inside the stability region of the method, the step size would have to be smaller than 1e-5.
With the corrected initial conditions (u = 0 at the start), the velocity u remains below 0.001, so the cubic term no longer determines stability; it is now governed by the last term, which contributes a Lipschitz term of 1/sqrt(L*C) ~ 125. The step size for stability is then about 0.02, and with 0.002 one can expect quantitatively useful results.
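In terms of the question's loop variables, a minimal sketch of the missing-h fix (only the first pair changes; the later stages already carried the factor h):

m1 = h*u           # was: m1 = u (h was missing)
k1 = h*F(y, u, x)  # was: k1 = F(y, u, x) (h was missing)
m2 = h*(u + 0.5*k1)
k2 = h*F(y + 0.5*m1, u + 0.5*k1, x + 0.5*h)
m3 = h*(u + 0.5*k2)
k3 = h*F(y + 0.5*m2, u + 0.5*k2, x + 0.5*h)
m4 = h*(u + k3)
k4 = h*F(y + m3, u + k3, x + h)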
You can use the decimal library for more precision (to handle more digits), but it's kind of annoying: every value has to be of the same class (decimal.Decimal).
For example:
import numpy as np
from math import pi
from numpy import arange
from matplotlib.pyplot import plot, show
# Import decimal.Decimal as D
import decimal
from decimal import Decimal as D

# Precision
decimal.getcontext().prec = 10_000_000

# parameters
# Every value should be of class D (decimal.Decimal)
R0 = D(200)
R1 = D(250)
L = D(15)
h = D(0.002)
Vo = D(1000)
C = D(4.2*10**(-6))
t = D(0.93)

def F(y, u, x):
    # Decomposed to use D
    a = D(Vo/L)
    b = D(-(R0/L)*u)
    c = D(-(R1/L)*u**D(3))
    d = D(-y*(D(1)/L*C))
    return ((a + b + c + d))

xpoints = arange(0, t, h)
ypoints = []
upoints = []
y = D(0.0)
u = D(Vo/L)
for x in xpoints:
    ypoints.append(y)
    upoints.append(u)
    m1 = u
    k1 = F(y, u, x)  # (x, v, t)
    m2 = (h*(u + D(0.5)*k1))
    k2 = (h*F(y + D(0.5)*m1, u + D(0.5)*k1, x + D(0.5)*h))
    m3 = h*(u + D(0.5)*k2)
    k3 = h*F(y + D(0.5)*m2, u + D(0.5)*k2, x + D(0.5)*h)
    m4 = h*(u + k3)
    k4 = h*F(y + m3, u + k3, x + h)
    y += (m1 + D(2)*m2 + D(2)*m3 + m4)/D(6)
    u += (k1 + D(2)*k2 + D(2)*k3 + k4)/D(6)
plot(xpoints, upoints)
show()
plot(xpoints, ypoints)
show()
But even with ten million digits of precision I still get an overflow error. Check the components of the formula: their values are far too large. You can increase the precision to handle them, but you'll notice it takes a long time to calculate.
Here is the problem implemented using scipy.integrate.odeint and scipy.integrate.solve_ivp.
import numpy as np
import matplotlib.pyplot as plt
from scipy.integrate import odeint, solve_ivp

# Input data
ti = 0.0
tf = 0.5
N = 100000
h = (tf - ti)/N

# Initial conditions
u0 = 0.0
Q0 = 0.0
t_span = np.linspace(ti, tf, N)
r0 = np.array([Q0, u0])

# Parameters
R0 = 200
R1 = 250
L = 15
C = 4.2*10**(-6)
V0 = 1000

# Systems of first-order equations
# This function is used with odeint, as specified in the documentation for scipy.integrate.odeint
def f(r, t, R0, R1, L, C, V0):
    Q, u = r
    ode1 = u
    ode2 = -((R0/L)*u) - ((R1/L)*u**3) - ((1/(L*C))*Q) + (V0/L)
    return np.array([ode1, ode2])

# This function is used in our 4th-order Runge-Kutta implementation and in scipy.integrate.solve_ivp
def F(t, r, R0, R1, L, C, V0):
    Q, u = r
    ode1 = u
    ode2 = -((R0/L)*u) - ((R1/L)*u**3) - ((1/(L*C))*Q) + (V0/L)
    return np.array([ode1, ode2])

# Resolution with odeint and solve_ivp
sol_1 = odeint(f, r0, t_span, args=(R0, R1, L, C, V0))
sol_2 = solve_ivp(fun=F, t_span=(ti, tf), y0=r0, method='LSODA', args=(R0, R1, L, C, V0))
Q_odeint, u_odeint = sol_1[:, 0], sol_1[:, 1]
Q_solve_ivp, u_solve_ivp = sol_2.y[0, :], sol_2.y[1, :]

# Figures
plt.figure(figsize=[30.0, 10.0])
plt.subplot(3, 1, 1)
plt.grid(color='red', linestyle='--', linewidth=0.4)
plt.plot(t_span, Q_odeint, 'r', t_span, u_odeint, 'b')
plt.xlabel('t(s)')
plt.ylabel('Q(t), u(t)')
plt.subplot(3, 1, 2)
plt.plot(sol_2.t, Q_solve_ivp, 'g', sol_2.t, u_solve_ivp, 'y')
plt.grid(color='yellow', linestyle='--', linewidth=0.4)
plt.xlabel('t(s)')
plt.ylabel('Q(t), u(t)')
plt.subplot(3, 1, 3)
plt.plot(Q_solve_ivp, u_solve_ivp, 'green')
plt.grid(color='yellow', linestyle='--', linewidth=0.4)
plt.xlabel('Q(t)')
plt.ylabel('u(t)')
plt.show()
Runge-Kutta 4th order
# Code development of 4th-order Runge-Kutta
# Parameters
R0 = 200
R1 = 250
L = 15
C = 4.2*10**(-6)
V0 = 1000

# Input data
ti = 0.0
tf = 0.5
N = 100000
h = (tf - ti)/N

# Initial conditions
u0 = 0.0
Q0 = 0.0

# First-order ordinary differential equations
def f1(t, Q, u):
    return u

def f2(t, Q, u):
    return -((R0/L)*u) - ((R1/L)*u**3) - ((1/(L*C))*Q) + (V0/L)

t = np.zeros(N); Q = np.zeros(N); u = np.zeros(N)
t[0] = ti
Q[0] = Q0
u[0] = u0
for i in range(0, N-1, 1):
    k1 = h*f1(t[i], Q[i], u[i])
    l1 = h*f2(t[i], Q[i], u[i])
    k2 = h*f1(t[i] + (h/2), Q[i] + (k1/2), u[i] + (l1/2))
    l2 = h*f2(t[i] + (h/2), Q[i] + (k1/2), u[i] + (l1/2))
    k3 = h*f1(t[i] + (h/2), Q[i] + (k2/2), u[i] + (l2/2))
    l3 = h*f2(t[i] + (h/2), Q[i] + (k2/2), u[i] + (l2/2))
    k4 = h*f1(t[i] + h, Q[i] + k3, u[i] + l3)
    l4 = h*f2(t[i] + h, Q[i] + k3, u[i] + l3)
    Q[i+1] = Q[i] + ((k1 + 2*k2 + 2*k3 + k4)/6)
    u[i+1] = u[i] + ((l1 + 2*l2 + 2*l3 + l4)/6)
    t[i+1] = t[i] + h

plt.figure(figsize=[20.0, 10.0])
plt.subplot(1, 2, 1)
plt.plot(sol_2.t, Q_solve_ivp, 'r', t, Q_odeint, 'y', t, Q, 'b')
plt.grid(color='yellow', linestyle='--', linewidth=0.4)
plt.xlabel('t(s)')
plt.ylabel(r'$Q(t)_{Odeint}$, $Q(t)_{RK4}$')
plt.subplot(1, 2, 2)
plt.plot(sol_2.t, Q_solve_ivp, 'g', t, Q_odeint, 'y', t, Q, 'b')
plt.grid(color='yellow', linestyle='--', linewidth=0.4)
plt.xlabel('t(s)')
plt.ylabel(r'$Q(t)_{solve\_ivp}$, $Q(t)_{RK4}$')

Vectorizing a Vector-Valued Function

I have the following function:
def h_Y1(X, theta):
    EF = X[0]
    FF = X[1]
    j = X[-2]
    k = X[-1]
    W = X[2:-2]
    Sigma = theta[0]
    sigma_xi2 = theta[1]
    gamma_alpha = theta[2]
    gamma_z = np.array(theta[3:])
    gW = gamma_z @ W
    eps1 = EF - gamma_alpha * gW
    if j == k:
        eps2 = FF - (gamma_alpha**2)*gW - gW*sigma_xi2 - Sigma
        eps3 = 0
    else:
        eps2 = 0
        eps3 = FF - (gamma_alpha**2)*gW
    h1 = [eps1 * Wk for Wk in W]
    h2 = [eps2 * Wk for Wk in W]
    h3 = [eps3 * Wk for Wk in W]
    return np.concatenate([h1, h2, h3])
I need to execute the function for a range of values, which are stored in gmmarray. Specifically, I'd like the function to be run for each row of gmmarray as the function argument X, for a fixed theta.
I'm currently doing this using the following code:
import numpy as np

theta = [0.01, 1, 1, 0, 0]
gmmarray = np.random.random((1120451, 6))
test = np.apply_along_axis(h_Y1, 1, gmmarray, theta=theta)
However, this is slow - it takes around 19 seconds. I tried vectorizing the function as follows:
Vh_Y1 = np.vectorize(h_Y1, signature='(n),(j)->(i)')
test1 = Vh_Y1(gmmarray, theta)
However, this still takes 16 seconds. Am I doing something wrong here, or is there a way to speed things up further?
Thanks so much!
You can pass the full gmmarray as the X parameter. Then, instead of looping through each row of gmmarray, you can use vectorized operations on its columns.
Something like this:
def h_Y1_vectorized(X, theta):
    EF, FF, W, j, k = np.hsplit(X, [1, 2, 4, 5])  # Column vectors (except W)
    Sigma, sigma_xi2, gamma_alpha, *gamma_z = theta
    gW = (W @ gamma_z)[:, None]  # Ensure column vector
    ga_gW = gamma_alpha * gW
    FF_ga2gW = FF - gamma_alpha * ga_gW
    eps1 = EF - ga_gW
    j_equal_k = j == k
    eps2 = np.where(j_equal_k, FF_ga2gW - gW * sigma_xi2 - Sigma, 0)
    eps3 = np.where(j_equal_k, 0, FF_ga2gW)
    h1 = eps1 * W
    h2 = eps2 * W
    h3 = eps3 * W
    return np.hstack([h1, h2, h3])
Calling
>>> h_Y1_vectorized(gmmarray, theta)
produces the same result with roughly a 100x speed increase.
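If you want to sanity-check the rewrite against the row-wise original before switching over, a small slice keeps the slow version quick:

small = gmmarray[:1000]
expected = np.apply_along_axis(h_Y1, 1, small, theta=theta)
got = h_Y1_vectorized(small, theta)
print(np.allclose(expected, got))  # should print True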

How to improve convergence while solving a large system of nonlinear equations

I am trying to solve a set of 10 equations with 10 unknown variables, some of which are nonlinear.
I have tried the fsolve and opt.least_squares solvers, but I have a convergence problem that depends strongly on the value of one coefficient, "alpha", and on the initial guess.
For instance:
When alpha=1, the "cost" is around 1e-2.
When alpha=1e3 (which I assume to be the right value), the "cost" is around 1e7.
The problem is that I have no idea what the initial guess could be.
As this is a physics project, the only thing I know is that the x_j (from 0 to 7) have to be between 0 and 3.
This is why I prefer opt.least_squares, which allows me to put bounds on these parameters.
Does anyone have an idea how I could improve the initial guess with so many equations and variables?
Here is my code:
import numpy as np
import scipy.optimize as opt

# first set of values
C = 50.6
H = 6.5
O = 42
N = 0.2
Moist = 14.7
FC = 19.2
ER = 0.3
Ash = 0.7
a = 1
b = 12*H/(C*1)
c = (O*12)/(C*16)
d = 0.02
M_f = 12 + 1 * b + 16 * c
M_H2O = 1 * 2 + 16
e = (Moist)/((100-Moist)*(100-Ash))*(M_f/M_H2O)
f = ER*(1+(b/4)-(c/2))
Kc = FC/C

# second set of values
alpha = 1
Gf_j = np.zeros(6, dtype=float)
Gf_j = np.array([0, -394.4, -137.3, -50.8, -237.2, 0]) * alpha
R = 8.134
T = 1373

# system of nonlinear equations
def equations(var):
    x_j = np.zeros(7, dtype=float)
    x_j[0], x_j[1], x_j[2], x_j[3], x_j[4], x_j[5], x_j[6], lambda_C, lambda_O, lambda_H = var
    xt = sum(abs(x_j))
    eq1 = a - abs(x_j[1]) - abs(x_j[2]) - abs(x_j[3]) - abs(x_j[6])
    eq2 = b + 2*e - 4*abs(x_j[3]) - 2*abs(x_j[4]) - 2*abs(x_j[5])
    eq3 = c + e + 2*f - 2*abs(x_j[1]) - abs(x_j[2]) - abs(x_j[4])
    eq4 = d + 7.52*f - 2*abs(x_j[0])
    eq5 = Gf_j[1] + R*T*np.log(abs(x_j[1]/xt)) + lambda_C + 2*lambda_O
    eq6 = Gf_j[2] + R*T*np.log(abs(x_j[2]/xt)) + lambda_C + lambda_O
    eq7 = Gf_j[3] + R*T*np.log(abs(x_j[3]/xt)) + lambda_C + 4*lambda_H
    eq8 = Gf_j[4] + R*T*np.log(abs(x_j[4]/xt)) + 2*lambda_H + lambda_O
    eq9 = Gf_j[5] + R*T*np.log(abs(x_j[5]/xt)) + 2*lambda_H
    eq10 = x_j[3] + x_j[6] - Kc
    return (eq1, eq2, eq3, eq4, eq5, eq6, eq7, eq8, eq9, eq10)

# # # use of fsolve
tt = 10000
guess1 = np.array([1.2, 0.3, 0.4, 0.1, 0.3, 0.4, 1, tt, 4*tt, 0.5*tt])
sol = opt.fsolve(equations, guess1)
# display the solution
print(sol)
# check the result
x_j = np.zeros(7, dtype=float)
x_j = abs(sol[:7])
lambda_C = sol[7]
lambda_O = sol[8]
lambda_H = sol[9]
xt = sum(abs(x_j))
eq = equations(sol)

# # # use of least squares
guess = np.array([1.2, 0.2, 0.4, 0.1, 0.4, 0.4, 0.5, 1e3, 1e4, 1e4])
bounds = ((1, 0.1, 0.1, 0.02, 0.1, 0.1, 0.1, -1e4, -1e4, -1e4), (1.5, 1, 1, 1, 1, 1, 2, 1e5, 1e5, 1e5))
mod_sol2 = opt.least_squares(equations, guess, bounds=bounds)
sol2 = mod_sol2.x
# display the solution
print(sol2)
# check the result
x_j = np.zeros(7, dtype=float)
x_j = abs(sol2[:7])
lambda_C = sol2[7]
lambda_O = sol2[8]
lambda_H = sol2[9]
xt = sum(abs(x_j))
eq = equations(sol2)
print("cost:", mod_sol2.cost)
Thanks
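One way to pick an initial guess when nothing better is known is a multi-start: sample random guesses inside the bounds and keep the lowest-cost fit. A minimal sketch using the equations and bounds defined in the code above (the restart count is arbitrary):

rng = np.random.default_rng(0)
lo, hi = np.array(bounds[0]), np.array(bounds[1])
best = None
for _ in range(50):                           # number of restarts is arbitrary
    g = lo + (hi - lo) * rng.random(lo.size)  # random point inside the bounds
    res = opt.least_squares(equations, g, bounds=bounds)
    if best is None or res.cost < best.cost:
        best = res
print("best cost:", best.cost)
print("best x:", best.x)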

Speed up matrix calculations (looping over subarrays) [numpy]

I'm trying to write an algorithm for my student work. It works well, but it takes a long time to run, especially with big arrays; this part of the code is slowing down the whole program.
Shapes: X.shape = mask.shape = logBN.shape = (500,500,1000),
F.shape = (20,20),
A.shape = (481,481),
s2 is a scalar.
How should I change this code to make it faster?
h = F.shape[0]
w = F.shape[1]
q = np.zeros((A.shape[0], A.shape[1], X.shape[2]))
for i in range(A.shape[0]):
    for j in range(A.shape[1]):
        mask[:,:,:] = 0
        mask[i:i + h, j:j + w, :] = 1
        # norm is scipy.stats.norm
        q[i,j,:] = ((logBN*(1 - mask)).sum(axis=(0,1)) +
                    (np.log(norm._pdf((X[i:i + h, j:j + w, :] - F[:,:,np.newaxis])/s2)/s2)).sum(axis=(0,1)))
After heavy juggling through algebraic operations on log, exp, and power, it all came down to this -

# Params
m, n = F.shape[:2]
k1 = 1.0/(s2*np.sqrt(2*np.pi))
k2 = -0.5/s2**2
k3 = np.log(k1)*m*n

out = np.zeros((A.shape[0], A.shape[1], X.shape[2]))
for i in range(A.shape[0]):
    for j in range(A.shape[1]):
        mask[:] = 1
        mask[i:i + h, j:j + w, :] = 0
        XF = X[i:i + h, j:j + w, :] - F[:, :, np.newaxis]
        p1 = np.einsum('ijk,ijk->k', logBN, mask)
        p2 = k2*np.einsum('ijk,ijk->k', XF, XF)
        out[i, j, :] = p1 + p2
out += k3
A few things used here:
1] norm._pdf is basically norm.pdf(x) = exp(-x**2/2)/sqrt(2*pi), so we can inline that implementation and optimize it at the script level.
2] Division by a scalar is less efficient than multiplication, so the divisions were replaced by multiplications with the reciprocals, stored as a pre-processing step before going into the loop.
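To see why the inlining in 1] is safe, here is a quick self-contained check of that identity (s2 here is an arbitrary positive stand-in for the scalar above):

import numpy as np
from scipy.stats import norm

s2 = 0.7                      # arbitrary positive scalar
x = np.random.randn(5)
k1 = 1.0 / (s2 * np.sqrt(2 * np.pi))
k2 = -0.5 / s2**2
lhs = np.log(norm.pdf(x / s2) / s2)
rhs = k2 * x**2 + np.log(k1)
print(np.allclose(lhs, rhs))  # True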
Just trying to make sense of your inner loop
mask[:,:,:] = 0
mask[i:i + h, j:j + w, :] = 1
q[i,j,:] = ((logBN*(1 - mask)).sum(axis=(0,1)) +
            (np.log(norm._pdf((X[i:i + h, j:j + w, :] - F[:,:,np.newaxis])/s2)/s2)).sum(axis=(0,1)))
looks like
idx = (slice(i, i+h), slice(j, j+w), slice(None))
mask = np.zeros(X.shape)
mask[idx] = 1
mask = 1 - mask
# alt: mask = np.ones(X.shape); mask[idx] = 0
term1 = (logBN*mask).sum(axis=(0,1))
term2 = np.log(norm._pdf((X[idx] - F[..., None])/s2)/s2).sum(axis=(0,1))
q[i,j,:] = term1 + term2
So idx and mask define a subarray of X. You are using logBN outside that window and the norm term inside it. You are summing values over the first two dimensions, so term1 and term2 both have shape X.shape[2], which you save in q.
That mask/window is 20x20.
As a first cut I'd try to calculate that term2 for all i,j at once; it looks like a typical sliding-window problem. I'd also try to express term1 as a subtraction - the whole logBN sum minus this window's (see the sketch below).
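For that term1 subtraction, here is a sketch of computing the h x w window sums of logBN for every i, j at once with a summed-area table (names and shapes follow the question):

import numpy as np

def window_sums(logBN, h, w):
    # Summed-area table over the first two axes; the sum over any
    # h x w window then falls out by inclusion-exclusion in O(1).
    S = logBN.cumsum(axis=0).cumsum(axis=1)
    S = np.pad(S, ((1, 0), (1, 0), (0, 0)))  # leading zero row/column
    return S[h:, w:] - S[:-h, w:] - S[h:, :-w] + S[:-h, :-w]

# term1 for every window position at once:
# total = logBN.sum(axis=(0, 1))
# term1_all = total - window_sums(logBN, h, w)  # shape (481, 481, 1000)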

python performance tips for looping calculation

Python 2.7, Windows 7.
I'm looking for tips on how to make a calculation-heavy script run faster. First, an idea of what I'm doing:
Starting with a given color, I want to generate a list of 30 more colors (RGB values) that are maximally distinctive to the human eye from one another, with the front of the list more distinctive than the end.
Currently, I estimate that the script will take ~48 hours to complete. I could let it run over the weekend, but I figured I would take the opportunity to learn something about Python performance.
An overview of what the code does:
gen_colours() contains a loop that runs 30 times. Each time, 4 processes run multi(p, l, q), which contains the big loop iterating over each r, g, and b value between 0 and 255 (the r range is split between the processes, so each loops 64 times). The innermost loop checks the rgb value against the rgb values already found by calling compute_dist([r, g, b], c).
Anyway, short of completely restructuring my code, anything to help speed it up would be cool. Also, running all four CPUs at max for 48 hours... any issues there?
Code:
from math import sqrt, pow, atan2, atan, sin, cos, exp, radians, degrees
from fractions import Fraction
import time
import multiprocessing

def to_xyz(rgb):
    r = rgb[0] / 255.0
    g = rgb[1] / 255.0
    b = rgb[2] / 255.0
    f = Fraction(12, 5)
    if r > 0.04045:
        r = ((r + 0.055) / 1.055) ** f
    else:
        r /= 12.92
    if g > 0.04045:
        g = ((g + 0.055) / 1.055) ** f
    else:
        g /= 12.92
    if b > 0.04045:
        b = ((b + 0.055) / 1.055) ** f
    else:
        b /= 12.92
    r *= 100
    g *= 100
    b *= 100
    # Observer = 2 degrees, Illuminant = D65
    x = r * 0.4124 + g * 0.3576 + b * 0.1805
    y = r * 0.2126 + g * 0.7152 + b * 0.0722
    z = r * 0.0193 + g * 0.1192 + b * 0.9505
    return [x, y, z]

def to_lab(xyz):
    x = xyz[0]
    y = xyz[1]
    z = xyz[2]
    # Observer = 2 degrees, Illuminant = D65
    x /= 95.047
    y /= 100.0
    z /= 108.883
    f = Fraction(1, 3)
    if x > 0.008856:
        x **= f
    else:
        x = 7.787 * x + 0.13793103448
    if y > 0.008856:
        y **= f
    else:
        y = 7.787 * y + 0.13793103448
    if z > 0.008856:
        z **= f
    else:
        z = 7.787 * z + 0.13793103448
    L = 116 * y - 16
    a = 500 * (x - y)
    b = 200 * (y - z)
    return [L, a, b]

def compute_dist(rgb1, rgb2):
    """Compute the apparent difference in colours using CIEDE2000 standards"""
    xyz1 = to_xyz(rgb1)
    xyz2 = to_xyz(rgb2)
    lab1 = to_lab(xyz1)
    lab2 = to_lab(xyz2)
    a1 = lab1[1]
    a2 = lab2[1]
    b1 = lab1[2]
    b2 = lab2[2]
    L1 = lab1[0]
    L2 = lab2[0]
    c1 = sqrt(a1 * a1 + b1 * b1)
    c2 = sqrt(a2 * a2 + b2 * b2)
    c = (c1 + c2) / 2
    crs = c ** 7
    x = 0.5 - 0.5 * sqrt(crs / (crs + 6103515625))
    temp = (1 + x) * a1
    c1 = sqrt(temp * temp + b1 * b1)
    h1 = hue(temp, b1)
    temp = (1 + x) * a2
    c2 = sqrt(temp * temp + b2 * b2)
    h2 = hue(temp, b2)
    dL = L2 - L1
    dc = c2 - c1
    if c1 * c2 == 0:
        dh = 0
    else:
        temp = round(h2 - h1, 12)
        if abs(temp) <= 180:
            dh = h2 - h1
        else:
            if temp > 180:
                dh = h2 - h1 - 360
            else:
                dh = h2 - h1 + 360
    dh = sqrt(c1 * c2) * sin(radians(dh / 2))
    dh += dh
    lav = (L1 + L2) / 2
    cav = (c1 + c2) / 2
    if c1 * c2 == 0:
        htot = h1 + h2
    else:
        temp = abs(round(h1 - h2, 12))
        if temp > 180:
            if h2 + h1 < 360:
                htot = h1 + h2 + 360
            else:
                htot = h1 + h2 - 360
        else:
            htot = h1 + h2
        htot /= 2
    T = 1 - 0.17 * cos(radians(htot - 30)) + 0.24 * cos(radians(2 * htot)) + 0.32 * cos(radians(3 * htot + 6)) - 0.20 * cos(radians(4 * htot - 63))
    htotdtme = (htot / 25) - 11
    xPH = 30 * exp(-htotdtme * htotdtme)
    cavrs = cav ** 7
    scocp = sqrt(cavrs / (cavrs + 6103515625))
    xRC = scocp + scocp
    lavmf = lav - 50
    lavmfs = lavmf * lavmf
    SL = 1 + 0.015 * lavmfs / sqrt(20 + lavmfs)
    SC = 1 + 0.045 * cav
    SH = 1 + 0.015 * cav * T
    RT = -sin(radians(xPH + xPH)) * xRC
    dL /= SL
    dc /= SC
    dh /= SH
    dE = sqrt(dL * dL + dc * dc + dh * dh + RT * dc * dh)
    return dE

def hue(a, b):  # Function returns CIELAB-Hue value
    c = 0
    if a >= 0 and b == 0:
        return 0
    if a < 0 and b == 0:
        return 180
    if a == 0 and b > 0:
        return 90
    if a == 0 and b < 0:
        return 270
    if a > 0 and b > 0:
        c = 0
    elif a < 0:
        c = 180
    elif b < 0:
        c = 360
    return degrees(atan(b / a)) + c

def multi(p, l, q):
    f = 0
    n = []
    s = p * 64
    e = (p + 1) * 64
    for r in xrange(s, e):
        for g in xrange(256):
            for b in xrange(256):
                s = 1000  # smallest dist
                for c in l:  # compare to existing colours
                    d = compute_dist([r, g, b], c)
                    if d < s:
                        s = d
                if s > f:
                    n = [r, g, b]
                    f = s
    q.put(f)
    q.put(n)

def gen_colours(start_col=[68, 68, 68]):
    out = open('colour_output.txt', 'w')
    l = [start_col]
    if __name__ == '__main__':
        q0 = multiprocessing.Queue()
        q1 = multiprocessing.Queue()
        q2 = multiprocessing.Queue()
        q3 = multiprocessing.Queue()
        for h in xrange(30):  # create 30 more colours
            p0 = multiprocessing.Process(target=multi, args=[0, l, q0])
            p1 = multiprocessing.Process(target=multi, args=[1, l, q1])
            p2 = multiprocessing.Process(target=multi, args=[2, l, q2])
            p3 = multiprocessing.Process(target=multi, args=[3, l, q3])
            p0.start()
            p1.start()
            p2.start()
            p3.start()
            p0.join()
            p1.join()
            p2.join()
            p3.join()
            d0 = q0.get()
            d1 = q1.get()
            d2 = q2.get()
            d3 = q3.get()
            c0 = q0.get()
            c1 = q1.get()
            c2 = q2.get()
            c3 = q3.get()
            d = [d0, d1, d2, d3]
            c = [c0, c1, c2, c3]
            m = max(d)
            i = d.index(m)
            n = c[i]
            l.append(n)
            out.write("[" + str(n[0]) + ", " + str(n[1]) + ", " + str(n[2]) + "]\n")
            print "\nnew colour added: " + str(l)
    out.close()
    print "Done"

gen_colours()
Any tips?
Edit:
An obvious improvement: I was recalculating the Lab values of the already-found RGB colors on every iteration. I added a list to store the Lab values of these colours so they aren't recomputed each loop (sketched below). This reduced the time by about a quarter, though it's not the kind of Python performance improvement I'm looking for.
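A sketch of that caching (using a dict keyed on the RGB tuple rather than a list; to_xyz, to_lab, and l are from the code above):

lab_cache = {tuple(c): to_lab(to_xyz(c)) for c in l}

def lab_of(rgb):
    # convert a colour once, then reuse it in every later distance check
    key = tuple(rgb)
    if key not in lab_cache:
        lab_cache[key] = to_lab(to_xyz(rgb))
    return lab_cache[key]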
I'm sure a color such as R:100 G:100 B:101 would not be a "maximally distinctive" choice if R:100 G:100 B:100 has already been chosen. One quick improvement would be to skip candidate colors that are similar to ones already checked (i.e. colors with the same R and G values whose B value lies within a given range).
You're doing way too much work.
It appears that you are working in a 24-bit RGB color space when most monitor/gamut/ambient/eye combinations afford far less discriminability than your CIE calculations produce.
Assuming you are doing this to provide real-world colors for real eyes, you also have to account for the myriad forms of colorblindness, which reduces you to less than a 12-bit useful color space. Even if we are just talking about luminance, as luminance approaches the lower third of the device gamut, noticeable differences become ever sparser.
The problem you have is algorithmic: you are working too hard to get detailed results when the added detail is irrelevant. Between #000 and #fff there are only 4096 possible colors, and the red-green axis can be rejected out of hand.
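To make the reduced search space concrete, here is a sketch of sampling 16 levels per channel, which yields exactly the 4096 candidates mentioned above and shrinks the innermost search by a factor of about 4096:

# 16 evenly spaced levels per channel: 0, 17, ..., 255
candidates = [(r, g, b)
              for r in range(0, 256, 17)
              for g in range(0, 256, 17)
              for b in range(0, 256, 17)]
assert len(candidates) == 4096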
