How to write a case-when-like statement on a numpy array - python

def custom_asymmetric_train(y_true, y_pred):
    residual = (y_true - y_pred).astype("float")
    grad = np.where(residual>0, -2*10.0*residual, -2*residual)
    hess = np.where(residual>0, 2*10.0, 2.0)
    return grad, hess
I want to write this statement:
case when residual >= 0   and residual <= 0.5 then -2*1.2*residual
     when residual >= 0.5 and residual <= 0.7 then -2*1.0*residual
     when residual >  0.7 then -2*2*residual end
However, I don't see how to express the and logic inside np.where. How do I write this case-when logic with np.where in Python?
Thanks

This statement can be written using np.select as:
import numpy as np
residual = np.random.rand(10) - 0.3  # subtract 0.3 to get some negative values
condlist = [(residual>=0.0)&(residual<=0.5), (residual>=0.5)&(residual<=0.7), residual>0.7]
choicelist = [-2*1.2*residual, -2*1.0*residual, -2*2.0*residual]
residual = np.select(condlist, choicelist, default=residual)
Note that when multiple conditions in condlist are satisfied, the first one encountered is used; when all conditions evaluate to False, the default value is used. Also, for your information, you need the bitwise operator & on boolean numpy arrays, as the Python and keyword won't work on them.
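To illustrate the & point with a minimal sketch (the array a is made up for demonstration):

import numpy as np

a = np.array([-0.2, 0.3, 0.6, 0.9])
(a >= 0.0) & (a <= 0.5)      # array([False,  True, False, False])
# (a >= 0.0) and (a <= 0.5)  # would raise ValueError: the truth value of an
#                            # array with more than one element is ambiguous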
Let's benchmark these answers:
residual = np.random.rand(10000) - 0.3

def charl_3where(residual):
    residual = np.where((residual>=0.0)&(residual<=0.5), -2*1.2*residual, residual)
    residual = np.where((residual>=0.5)&(residual<=0.7), -2*1.0*residual, residual)
    residual = np.where(residual>0.7, -2*2.0*residual, residual)
    return residual

def yaco_select(residual):
    condlist = [(residual>=0.0)&(residual<=0.5), (residual>=0.5)&(residual<=0.7), residual>0.7]
    choicelist = [-2*1.2*residual, -2*1.0*residual, -2*2.0*residual]
    residual = np.select(condlist, choicelist, default=residual)
    return residual
%timeit charl_3where(residual)
>>> 112 µs ± 1.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit yaco_select(residual)
>>> 141 µs ± 2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Let's try to optimize these with numba:
from numba import jit

@jit(nopython=True)
def yaco_numba(residual):
    out = np.empty_like(residual)
    for i in range(residual.shape[0]):
        if residual[i] < 0.0:
            out[i] = residual[i]
        elif residual[i] <= 0.5:
            out[i] = -2*1.2*residual[i]
        elif residual[i] <= 0.7:
            out[i] = -2*1.0*residual[i]
        else:  # residual > 0.7
            out[i] = -2*2.0*residual[i]
    return out
%timeit yaco_numba(residual)
>>> 6.65 µs ± 123 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Final check
res1 = charl_3where(residual)
res2 = yaco_select(residual)
res3 = yaco_numba(residual)
np.allclose(res1,res3)
>>> True
np.allclose(res2,res3)
>>> True
The numba version is about 15x faster than the previous best answer. Hope this helps.
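To tie this back to the original custom_asymmetric_train, the same np.select pattern can produce both grad and hess. This is only a sketch: the hess values 2*1.2, 2*1.0 and 2*2.0 mirror the grad = -2*k*residual, hess = 2*k pattern of the original function, and the residual < 0 branch falls back to the original -2*residual / 2.0 behaviour since the question leaves that case unspecified:

def custom_asymmetric_train_select(y_true, y_pred):
    residual = (y_true - y_pred).astype("float")
    condlist = [(residual>=0.0)&(residual<=0.5), (residual>=0.5)&(residual<=0.7), residual>0.7]
    # scalars in choicelist are broadcast against the conditions
    grad = np.select(condlist, [-2*1.2*residual, -2*1.0*residual, -2*2.0*residual],
                     default=-2*residual)
    hess = np.select(condlist, [2*1.2, 2*1.0, 2*2.0], default=2.0)
    return grad, hess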

You can use the syntax (condition1) & (condition2) in np.where() calls, so you would modify your function's np.where() calls like so:
def custom_asymmetric_train(y_true, y_pred):
    residual = (y_true - y_pred).astype("float")
    residual = np.where((residual>=0.0)&(residual<=0.5), -2*1.2*residual, residual)
    residual = np.where((residual>=0.5)&(residual<=0.7), -2*1.0*residual, residual)
    residual = np.where(residual>0.7, -2*2.0*residual, residual)
    ...
The first argument is the condition to meet, the second is the value to use where the condition is met, and the third is the value to use where it is not. One caveat: because each call overwrites residual, later conditions are evaluated on already-transformed values; with these particular coefficients the transformed values never fall back into a later interval, but that is worth checking whenever you chain np.where like this.

You can also use vectorization, since the conditions are mutually exclusive (note the strict inequality at 0.5 below; with >= in both masks, a value of exactly 0.5 would be counted twice in the sum):
residual = (y_true - y_pred).astype(float)
m1 = (residual>=0.0)&(residual<=0.5)
m2 = (residual>0.5)&(residual<=0.7)
m3 = residual>0.7
new_residual = -2*(m1*1.2*residual + m2*1.0*residual + m3*2.0*residual)
return new_residual
This will have the following performance:
residual = np.random.rand(10000) - 0.3

def speed_test(residual):
    m1 = (residual>=0.0)&(residual<=0.5)
    m2 = (residual>0.5)&(residual<=0.7)
    m3 = residual>0.7
    return -2*(m1*1.2*residual + m2*1.0*residual + m3*2.0*residual)
%timeit speed_test(residual)
123 µs ± 35.3 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
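A quick consistency check against the np.select answer (a sketch): this masked-sum version returns 0 wherever residual < 0, since all three masks are False there, while yaco_select keeps the original value, so the comparison is restricted to the non-negative entries:

residual = np.random.rand(10000) - 0.3
mask = residual >= 0
print(np.allclose(speed_test(residual)[mask], yaco_select(residual)[mask]))  # True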

Estimate the rotation between two 2D point clouds

I have two 2D point clouds with an equal number of elements. For these elements I know their correspondence, i.e. for each point in PC1 I know the corresponding element in PC2 and vice versa.
I would now like to estimate the rotation between these two point clouds. That is, I would like to find the angle alpha by which I must rotate all points in PC1 around the origin such that the distance between corresponding points in PC1 and PC2 is minimized.
I can solve this using scipy's scalar optimizer (see below); however, this optimization sits inside a loop on the critical path of my code and is the current bottleneck.
import numpy as np
from scipy.optimize import minimize_scalar
from math import sin, cos

# generate some data for demonstration purposes
# points in each point cloud are ordered by correspondence
num_points = 10

distance = np.random.rand(num_points) * 3
radii = np.random.rand(num_points) * 2*np.pi
pc1 = distance[:, None] * np.stack([np.cos(radii), np.sin(radii)], axis=1)

distance = np.random.rand(num_points) * 3
radii = np.random.rand(num_points) * 2*np.pi
pc2 = distance[:, None] * np.stack([np.cos(radii), np.sin(radii)], axis=1)

# solve using scipy
def score(alpha):
    rot_matrix = np.array([
        [cos(alpha), -sin(alpha)],
        [sin(alpha), cos(alpha)]
    ])
    pc1_rotated = (rot_matrix @ pc1.T).T
    sum_of_squares = np.sum((pc2 - pc1_rotated)**2, axis=1)
    mse = np.mean(sum_of_squares)
    return mse

# simple solution via scipy
result = minimize_scalar(
    score,
    bounds=(0, 2*np.pi),
    method="bounded",
    options={"maxiter": 1000},
)

if result.success:
    print(f"Best angle: {result.x}")
else:
    raise RuntimeError(f"IK failed. Reason: {result.message}")
Is there a faster (potentially analytic) solution to this problem?
Since minimize_scalar only uses derivative-free methods, the optimization runtime depends heavily on the time needed to evaluate your objective function score. Consequently, I'd recommend accelerating this function as much as possible.
Let's time your function and the optimizer as benchmark reference:
In [68]: %timeit score(0.5)
20.2 µs ± 203 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [69]: %timeit result = minimize_scalar(score,bounds=(0, 2*np.pi),method="bounded",options={"maxiter": 1000})
415 µs ± 7.27 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Firstly, note that (rot_matrix @ pc1.T).T is the same as pc1 @ rot_matrix.T, i.e. we only need to transpose one matrix instead of two.
Next, note that -sin(alpha) = cos(alpha + 5*pi/2) and sin(alpha) = cos(alpha + 3*pi/2). This means that we only need one function call of np.cos to create the rot_matrix instead of four calls of math.sin or math.cos.
Lastly, you can compute the mse more efficiently with np.einsum: np.einsum('ij,ij->', diff, diff) sums the squared entries of diff in one pass, without allocating an intermediate array.
Considering all points, the function can look like this:
k1 = 5*np.pi/2
k2 = 3*np.pi/2

def score2(alpha):
    rot_matrixT = np.cos((alpha, alpha+k2, alpha+k1, alpha)).reshape(2,2)
    pc1_rotated = pc1 @ rot_matrixT
    diff = pc2 - pc1_rotated
    return np.einsum('ij,ij->', diff, diff) / num_points
Timing the function again yields
In [70]: %timeit score2(0.5)
9.26 µs ± 84.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
and therefore, the optimizer is much faster:
In [71]: %timeit result = minimize_scalar(score2, bounds=(0, 2*np.pi), method="bounded", options={"maxiter": 1000})
279 µs ± 1.79 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
If that still is not fast enough, you can just-in-time compile your function by Numba:
In [60]: from numba import njit

In [61]: @njit
    ...: def score3(alpha):
    ...:     rot_matrix = np.array([
    ...:         [cos(alpha), -sin(alpha)],
    ...:         [sin(alpha), cos(alpha)]
    ...:     ])
    ...:     pc1_rotated = (rot_matrix @ pc1.T).T
    ...:     sum_of_squares = np.sum((pc2 - pc1_rotated)**2, axis=1)
    ...:     mse = np.mean(sum_of_squares)
    ...:     return mse
In [62]: %timeit score3(0.5)
2.97 µs ± 47.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
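One caveat: Numba treats global variables such as pc1 and pc2 as compile-time constants, so rebinding them to new arrays after the first call will not be picked up by score3. A variant that takes the point clouds as explicit arguments (a sketch, with the rotation written out by hand rather than using the original numpy expressions) avoids this:

import numpy as np
from numba import njit

@njit
def score3b(alpha, pc1, pc2):
    c, s = np.cos(alpha), np.sin(alpha)
    mse = 0.0
    for i in range(pc1.shape[0]):
        # rotate p_i by alpha and accumulate the squared error against q_i
        d1 = pc2[i, 0] - (c*pc1[i, 0] - s*pc1[i, 1])
        d2 = pc2[i, 1] - (s*pc1[i, 0] + c*pc1[i, 1])
        mse += d1*d1 + d2*d2
    return mse / pc1.shape[0]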
Alternatively, you can rewrite the function in Cython. Just for the sake of completeness, here's a fast Cython implementation:
In [45]: %%cython -c=-O3 -c=-march=native -c=-Wno-deprecated-declarations -c=-Wno-#warnings
    ...:
    ...: from libc.math cimport cos, sin
    ...: cimport numpy as np
    ...: import numpy as np
    ...: from cython cimport wraparound, boundscheck
    ...:
    ...: @wraparound(False)
    ...: @boundscheck(False)
    ...: cpdef double score4(double alpha, double[:, ::1] pc1, double[:, ::1] pc2):
    ...:     cdef int i
    ...:     cdef int N = pc1.shape[0]
    ...:     cdef double diff1 = 0.0
    ...:     cdef double diff2 = 0.0
    ...:     cdef double mse = 0.0
    ...:     cdef double rmT00 = cos(alpha)
    ...:     cdef double rmT01 = sin(alpha)
    ...:     cdef double rmT10 = -rmT01
    ...:     cdef double rmT11 = rmT00
    ...:
    ...:     for i in range(N):
    ...:         diff1 = pc2[i,0] - (pc1[i,0]*rmT00 + pc1[i,1]*rmT10)
    ...:         diff2 = pc2[i,1] - (pc1[i,0]*rmT01 + pc1[i,1]*rmT11)
    ...:         mse += diff1*diff1 + diff2*diff2
    ...:     return mse / N
which yields
In [48]: %timeit score4(0.5, pc1, pc2)
1.05 µs ± 15.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
Last but not least, you can write down the first-order necessary condition of your problem and check whether it can be solved analytically. Otherwise, you can try to solve the resulting nonlinear equation numerically.
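As it happens, for this 2D case the first-order condition does have a closed form (the standard 2D orthogonal Procrustes result, offered here as a sketch rather than as part of the original answer): minimizing sum_i ||R(alpha) p_i - q_i||^2 is equivalent to maximizing cos(alpha)*A + sin(alpha)*B with A = sum_i <p_i, q_i> and B = sum_i (p_i x q_i), whose maximizer is alpha = atan2(B, A):

import numpy as np

def best_rotation_angle(pc1, pc2):
    # A: sum of dot products <p_i, q_i>; B: sum of 2D cross products p_i x q_i
    A = np.sum(pc1 * pc2)
    B = np.sum(pc1[:, 0]*pc2[:, 1] - pc1[:, 1]*pc2[:, 0])
    return np.arctan2(B, A) % (2*np.pi)

This removes the optimizer loop entirely, at the cost of assuming the derivation above matches your exact objective.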

How to speed-up the Ipopt solver?

I want to solve the following (relaxed, i.e. v(t) ∈ [0, 1]) optimal control problem with cyipopt:
Here's what I have so far to solve the discretized problem:
import numpy as np
import matplotlib.pyplot as plt
from cyipopt import minimize_ipopt
from scipy.optimize._numdiff import approx_derivative

# z = (x1(t0) .... x1(tN) x2(t0) .... x2(tN) v(t0) .... v(tN))^T
def objective(z, time):
    x0, x1, v = np.split(z, 3)
    res = 0.0
    for i in range(time.size-1):
        h = time[i+1] - time[i]
        res += h*((x0[i]-1)**2 + (x1[i]-1)**2)
    return res

def ode_rhs(t, x, v):
    x0, x1 = x
    xdot1 = x0 - x0*x1 - 0.4*x0*v
    xdot2 = -x1 + x0*x1 - 0.2*x1*v
    return np.array([xdot1, xdot2])

def constraint(z, time):
    x0, x1, v = np.split(z, 3)
    x = np.array([x0, x1])
    res = np.zeros((2, x0.size))
    # initial values
    res[:, 0] = x[:, 0] - np.array([0.5, 0.7])
    # 'solve' the ode-system
    for j in range(time.size-1):
        h = time[j+1] - time[j]
        # implicit Euler scheme
        res[:, j+1] = x[:, j+1] - x[:, j] - h*ode_rhs(time[j+1], x[:, j+1], v[j])
    return res.flatten()

# time grid
tspan = [0, 12]
dt = 0.1
time = np.arange(tspan[0], tspan[1] + dt, dt)

# initial point
z0 = 0.1 + np.zeros(time.size*3)

# variable bounds
bnds = [(None, None) if i < 2*time.size else (0, 1) for i in range(z0.size)]

# constraints:
cons = [{
    'type': 'eq',
    'fun': lambda z: constraint(z, time),
    'jac': lambda z: approx_derivative(lambda zz: constraint(zz, time), z)
}]

# call the solver
res = minimize_ipopt(lambda z: objective(z, time), x0=z0, bounds=bnds,
                     constraints=cons, options={'disp': 5})
The code works as expected. However, it runs quite slow. Any ideas on how I can speed up the solver?
By analyzing Ipopt's output
Total CPU secs in IPOPT (w/o function evaluations) = 30.153
Total CPU secs in NLP function evaluations = 203.782
we can see that the evaluation of your functions is the bottleneck. So let's try to profile your code as Tom suggested in the comments:
In [2]: %timeit objective(z0, time)
307 µs ± 6.96 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In [3]: %timeit constraint(z0, time)
1.38 ms ± 4.77 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Okay, not bad. But we can do better. As a rule of thumb: try to prevent loops in numerical python code whenever possible. You can find some numpy best practices e.g. in Jake VanderPlas' excellent talk at PyCon 2015. Your objective is equivalent to:
def objective(z, time):
    x0, x1, v = np.split(z, 3)
    h = time[1:] - time[:-1]
    return np.sum(h*((x0[:-1]-1)**2 + (x1[:-1]-1)**2))
Similarly, you can remove the loop inside your constraint function. Note that
# 'solve' the ode-system
for j in range(time.size-1):
    h = time[j+1] - time[j]
    # implicit Euler scheme
    res[:, j+1] = x[:, j+1] - x[:, j] - h*ode_rhs(time[j+1], x[:, j+1], v[j])
is the same as
h = time[1:] - time[:-1]
res[:, 1:] = x[:, 1:] - x[:, :-1] - h * ode_rhs(time[1:], x[:, 1:], v[:-1])
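Before benchmarking, it is worth verifying the rewrite. A quick sanity-check sketch, where objective_loop and constraint_loop are hypothetical names for copies of the original looped definitions:

assert np.isclose(objective_loop(z0, time), objective(z0, time))
assert np.allclose(constraint_loop(z0, time), constraint(z0, time))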
Timing the functions again, we get
In [4]: %timeit objective(z0, time)
31.8 µs ± 683 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [5]: %timeit constraint(z0, time)
54.1 µs ± 647 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
i.e. speedups with factors 10x and 25x! Consequently, we can significantly reduce the solver runtime:
Total CPU secs in IPOPT (w/o function evaluations) = 30.906
Total CPU secs in NLP function evaluations = 46.950
However, note that calculating the gradient and jacobian numerically by finite differences is still computationally expensive and prone to rounding errors:
In [6]: %timeit approx_derivative(lambda zz: objective(zz, time), z0)
232 ms ± 3.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [7]: %timeit approx_derivative(lambda zz: constraint(zz, time), z0)
642 ms ± 1.13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Instead, we can go one step further and calculate both via algorithmic differentiation (AD) by means of the jax library:
from jax.config import config
# enable 64 bit floating point precision
config.update("jax_enable_x64", True)
import jax.numpy as np
from jax import grad, jacfwd, jit
Then, we only need to change the constraint function as follows:
def constraint(z, time):
    x0, x1, v = np.split(z, 3)
    x = np.array([x0, x1])
    res = np.zeros((2, x0.size))
    # initial values
    res = res.at[:, 0].set(x[:, 0] - np.array([0.5, 0.7]))
    h = time[1:] - time[:-1]
    res = res.at[:, 1:].set(x[:, 1:] - x[:, :-1] - h*ode_jit(time[1:], x[:, 1:], v[:-1]))
    return res.flatten()
since jax arrays are immutable and item assignment is not supported, hence the .at[...].set(...) syntax. Next, we just-in-time (jit) compile the functions:
# jit the functions
ode_jit = jit(ode_rhs)
obj_jit = jit(lambda z: objective(z, time))
con_jit = jit(lambda z: constraint(z, time))
# Build and jit the derivatives
obj_grad = jit(grad(obj_jit)) # objective gradient
con_jac = jit(jacfwd(con_jit)) # constraint jacobian
# Dummy first call in order to compile the functions
print("Compiling the functions...")
_ = obj_jit(z0), con_jit(z0), obj_grad(z0), con_jac(z0)
print("Done.")
Timing again, we obtain
In [10]: %timeit obj_grad(z0)
62.1 µs ± 353 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [11]: %timeit con_jac(z0)
204 µs ± 1.37 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
i.e. speedups of roughly 3700x and 3100x. Finally, we can pass the exact gradient and jacobian to the solver:
# constraints:
cons = [{'type': 'eq', 'fun': con_jit, 'jac': con_jac}]
# call the solver
res = minimize_ipopt(obj_jit, x0=z0, jac=obj_grad, bounds=bnds,
                     constraints=cons, options={'disp': 5})
and obtain
Total CPU secs in IPOPT (w/o function evaluations) = 35.348
Total CPU secs in NLP function evaluations = 1.691

Gaussian kernel performance

The following method calculates a Gaussian kernel matrix:
import numpy as np

def gaussian_kernel(X, X2, sigma):
    """
    Calculate the Gaussian kernel matrix

        k_ij = exp(-||x_i - x_j||^2 / (2 * sigma^2))

    :param X: array-like, shape=(n_samples_1, n_features), feature-matrix
    :param X2: array-like, shape=(n_samples_2, n_features), feature-matrix
    :param sigma: scalar, bandwidth parameter
    :return: array-like, shape=(n_samples_1, n_samples_2), kernel matrix
    """
    norm = np.square(np.linalg.norm(X[None,:,:] - X2[:,None,:], axis=2).T)
    return np.exp(-norm/(2*np.square(sigma)))
# Usage example
%timeit gaussian_kernel(np.random.rand(5000, 10), np.random.rand(5000, 10), 1)
1.43 s ± 39.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
My question is: are there any ways to increase performance using numpy?
For quite small arrays you can write a simple loop implementation and compile it using Numba. For larger arrays the algebraic reformulation using np.dot() will be faster.
Example
# from version 0.43 until 0.47 this has to be set before importing numba
# bug: https://github.com/numba/numba/issues/4689
from llvmlite import binding
binding.set_option('SVML', '-vector-library=SVML')

import numba as nb
import numpy as np

@nb.njit(fastmath=True, error_model="numpy", parallel=True)
def gaussian_kernel_2(X, X2, sigma):
    res = np.empty((X.shape[0], X2.shape[0]), dtype=X.dtype)
    for i in nb.prange(X.shape[0]):
        for j in range(X2.shape[0]):
            acc = 0.
            for k in range(X.shape[1]):
                acc += (X[i,k] - X2[j,k])**2 / (2*sigma**2)
            res[i,j] = np.exp(-acc)
    return res
Timings
X1=np.random.rand(5000, 10)
X2=np.random.rand(5000, 10)
#Your solution
%timeit gaussian_kernel(X1,X2, 1)
#511 ms ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit gaussian_kernel_2(X1,X2, 1)
#90.1 ms ± 9.14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
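As a sanity check (a sketch), the two implementations can be compared on smaller inputs; note that fastmath=True reorders floating-point operations, so agreement is only up to small tolerances:

A = np.random.rand(500, 10)
B = np.random.rand(500, 10)
print(np.allclose(gaussian_kernel(A, B, 1), gaussian_kernel_2(A, B, 1)))  # expected: True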
This post gives an answer: https://stackoverflow.com/a/47271663/9539058
In short, copying the numpy part:
import numpy as np

def gaussian_kernel(X, X2, sigma):
    """
    Calculate the Gaussian kernel matrix

        k_ij = exp(-||x_i - x_j||^2 / (2 * sigma^2))

    :param X: array-like, shape=(n_samples_1, n_features), feature-matrix
    :param X2: array-like, shape=(n_samples_2, n_features), feature-matrix
    :param sigma: scalar, bandwidth parameter
    :return: array-like, shape=(n_samples_1, n_samples_2), kernel matrix
    """
    X_norm = np.sum(X ** 2, axis=-1)
    X2_norm = np.sum(X2 ** 2, axis=-1)
    norm = X_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X, X2.T)
    return np.exp(-norm/(2*np.square(sigma)))
# Timing
%timeit gaussian_kernel(np.random.rand(5000, 10), np.random.rand(5000, 10), 1)
976 ms ± 73.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
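One caveat worth adding (not part of the original answer): the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x, y> can produce tiny negative entries in norm through floating-point cancellation when two rows are nearly identical, which then yields kernel values slightly above 1. A hedged one-line fix is to clamp before the exponential:

norm = np.maximum(norm, 0)  # clamp small negative values caused by cancellation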

More efficient weighted Gini coefficient in Python

Per https://stackoverflow.com/a/48981834/1840471, this is an implementation of the weighted Gini coefficient in Python:
import numpy as np

def gini(x, weights=None):
    if weights is None:
        weights = np.ones_like(x)
    # Calculate mean absolute deviation in two steps, for weights.
    count = np.multiply.outer(weights, weights)
    mad = np.abs(np.subtract.outer(x, x) * count).sum() / count.sum()
    rmad = mad / np.average(x, weights=weights)
    # Gini equals half the relative mean absolute deviation.
    return 0.5 * rmad
This is clean and works well for medium-sized arrays, but as warned in its initial suggestion (https://stackoverflow.com/a/39513799/1840471) it's O(n²). On my computer that means it breaks after ~20k rows:
n = 20000 # Works, 30000 fails.
gini(np.random.rand(n), np.random.rand(n))
Can this be adjusted to work for larger datasets? Mine is ~150k rows.
Here is a version which is much faster than the one provided above; it also uses a simplified formula for the unweighted case to get even faster results there.
def gini(x, w=None):
    # The rest of the code requires numpy arrays.
    x = np.asarray(x)
    if w is not None:
        w = np.asarray(w)
        sorted_indices = np.argsort(x)
        sorted_x = x[sorted_indices]
        sorted_w = w[sorted_indices]
        # Force float dtype to avoid overflows
        cumw = np.cumsum(sorted_w, dtype=float)
        cumxw = np.cumsum(sorted_x * sorted_w, dtype=float)
        return (np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) /
                (cumxw[-1] * cumw[-1]))
    else:
        sorted_x = np.sort(x)
        n = len(x)
        cumx = np.cumsum(sorted_x, dtype=float)
        # The above formula, with all weights equal to 1, simplifies to:
        return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n
Here is some test code to check we get (mostly) the same results:
>>> x = np.random.rand(1000000)
>>> w = np.random.rand(1000000)
>>> gini_max_ghenis(x, w)
0.33376310938610521
>>> gini(x, w)
0.33376310938610382
But the speed is very different:
%timeit gini(x, w)
203 ms ± 3.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit gini_max_ghenis(x, w)
55.6 s ± 3.35 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
If you remove the pandas ops from the function, it is already much faster:
%timeit gini_max_ghenis_no_pandas_ops(x, w)
1.62 s ± 75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
If you want to get the last drop of performance you could use numba or cython but that would only gain a few percent because most of the time is spent in sorting.
%timeit ind = np.argsort(x); sx = x[ind]; sw = w[ind]
180 ms ± 4.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
edit: gini_max_ghenis is the code used in Max Ghenis' answer
Adapting the StatsGini R function from here:
import numpy as np
import pandas as pd

def gini(x, w=None):
    # Array indexing requires reset indexes.
    x = pd.Series(x).reset_index(drop=True)
    if w is None:
        w = np.ones_like(x)
    w = pd.Series(w).reset_index(drop=True)
    n = x.size
    wxsum = sum(w * x)
    wsum = sum(w)
    sxw = np.argsort(x)
    sx = x[sxw] * w[sxw]
    sw = w[sxw]
    pxi = np.cumsum(sx) / wxsum
    pci = np.cumsum(sw) / wsum
    g = 0.0
    for i in np.arange(1, n):
        g = g + pxi.iloc[i] * pci.iloc[i - 1] - pci.iloc[i] * pxi.iloc[i - 1]
    return g
This works for large vectors, at least up to 10M rows:
n = int(1e7)  # np.random.rand needs an integer size
gini(np.random.rand(n), np.random.rand(n))  # Takes ~15s.
It also produces the same result as the function provided in the question, for example giving 0.2553 for the following input:
gini(np.array([3, 1, 6, 2, 1]), np.array([4, 2, 2, 10, 1]))

Perform multiple comparisons (interval) in a Numpy array in a single pass

I have a situation similar to the "Count values in a certain range" question, but instead of a column vector I have a matrix intervals with two columns [upper, lower] and another column vector true_values.
I want to check whether the values in the true_values vector are within the ranges defined by [upper, lower], element-wise.
The answer provided in the linked question would do 4 passes:
((true_values >= intervals[:, 0]) & (true_values <= intervals[:, 1])).sum()
One pass for each greater/less-than check, one for the and clause, and one for the sum.
Given that these are potentially huge matrices, I'm wondering if it's possible to reduce the number of passes, ideally to one pass for the interval checks plus one for the sum (which I think is unavoidable). I was thinking of something like broadcasting a function over the rows of intervals.
Here's a minimal example:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
n_samples = 2000
n_features = 10
rng = np.random.RandomState(0)
X = rng.normal(size=(n_samples, n_features))
w = rng.normal(size=n_features)
# simple linear function without noise
y = np.dot(X, w)
gbrt = GradientBoostingRegressor(loss='quantile', alpha=0.95)
gbrt.fit(X, y)
# Get upper interval
upper_interval = gbrt.predict(X)
# Get lower interval
gbrt.set_params(alpha=0.05)
gbrt.fit(X, y)
lower_interval = gbrt.predict(X)
intervals = np.concatenate((lower_interval[:, np.newaxis], upper_interval[:, np.newaxis]), axis=1)
# This is 4 passes:
perc_correct_intervals = ((y >= intervals[:, 0]) & (y <= intervals[:, 1])).sum() / y.shape[0]
There are some savings with np.count_nonzero vs .sum(), and more if you don't actually need to build the intervals matrix for other uses:
%%timeit
intervals = np.concatenate((lower_interval[:, np.newaxis], upper_interval[:, np.newaxis]), axis=1);
perc_correct_intervals = ((y >= intervals[:, 0]) & (y <= intervals[:, 1])).sum() / y.shape[0]
15.7 µs ± 78.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%%timeit
np.count_nonzero(np.less(lower_interval, y)*np.less(y, upper_interval))/y.size
3.93 µs ± 28 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
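Note that the one-liner above uses strict inequalities (np.less), whereas the original check was inclusive; an inclusive variant (a sketch) keeps the same single-expression shape:

np.count_nonzero(np.less_equal(lower_interval, y) & np.less_equal(y, upper_interval)) / y.size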
