I have a 3D array that I need to interpolate over one axis (the last dimension). Let's say y.shape = (nx, ny, nz), I want to interpolate in nz for every (nx, ny). However, I want to interpolate for a different value in each [i, j].
Here's some code to exemplify. If I wanted to interpolate to a single value, say new_z, I'd use scipy.interpolate.interp1d like this
# y is a 3D ndarray
# x is a 1D ndarray with the abscissa values
# new_z is a number
f = scipy.interpolate.interp1d(x, y, axis=-1, kind='linear')
result = f(new_z)
However, for this problem what I actually want is to interpolate to a different new_z for each y[i, j]. So I do this:
# y is a 3D ndarray
# x is a 1D ndarray with the abcissa values
# new_z is a 2D array
result = numpy.empty(y.shape[:-1])
for i in range(nx):
    for j in range(ny):
        f = scipy.interpolate.interp1d(x, y[i, j], axis=-1, kind='linear')
        result[i, j] = f(new_z[i, j])
Unfortunately, with multiple loops this becomes inefficient and slow. Is there a better way to do this kind of interpolation? Linear interpolation is sufficient. A possibility is to implement this in Cython, but I was trying to avoid that because I want to have the flexibility of changing to cubic interpolation and don't want to do it by hand in Cython.
To speed up higher-order interpolation, you can call interp1d() only once and then use its _spline attribute together with the low-level function _bspleval() from the _fitpack module. Here is the code:
from scipy.interpolate import interp1d
import numpy as np
nx, ny, nz = 30, 40, 50
x = np.arange(0, nz, 1.0)
y = np.random.randn(nx, ny, nz)
new_x = np.random.random_integers(1, (nz-1)*10, size=(nx, ny))/10.0
def original_interpolation(x, y, new_x):
    result = np.empty(y.shape[:-1])
    for i in xrange(nx):
        for j in xrange(ny):
            f = interp1d(x, y[i, j], axis=-1, kind=3)
            result[i, j] = f(new_x[i, j])
    return result
def fast_interpolation(x, y, new_x):
    from scipy.interpolate._fitpack import _bspleval
    f = interp1d(x, y, axis=-1, kind=3)
    xj, cvals, k = f._spline
    result = np.empty_like(new_x)
    for (i, j), value in np.ndenumerate(new_x):
        result[i, j] = _bspleval(value, x, cvals[:, i, j], k, 0)
    return result
r1 = original_interpolation(x, y, new_x)
r2 = fast_interpolation(x, y, new_x)
>>> np.allclose(r1, r2)
True
%timeit original_interpolation(x, y, new_x)
1 loops, best of 3: 3.78 s per loop
%timeit fast_interpolation(x, y, new_x)
100 loops, best of 3: 15.4 ms per loop
I don't think interp1d has a method for doing this fast, so you can't avoid the loop here.
You can probably still avoid Cython, though, by coding up the linear interpolation yourself with np.searchsorted, something like this (not tested):
def interp3d(x, y, new_x):
    assert x.ndim == 1 and y.ndim == 3 and new_x.ndim == 2
    assert y.shape[:2] == new_x.shape and x.shape == y.shape[2:]
    nx, ny = y.shape[:2]
    new_x = new_x.ravel()
    j = np.arange(len(new_x))
    k = np.searchsorted(x, new_x).clip(1, len(x) - 1)
    y = y.reshape(-1, x.shape[0])
    p = (new_x - x[k-1]) / (x[k] - x[k-1])
    result = (1 - p) * y[j, k-1] + p * y[j, k]
    return result.reshape(nx, ny)
Doesn't help with cubic interpolation, though.
EDIT: made it a function and fixed off-by-one errors. Some timings vs. Cython (500x500x500 grid):
In [58]: %timeit interp3d(x, y, new_x)
10 loops, best of 3: 82.7 ms per loop
In [59]: %timeit cyfile.interp3d(x, y, new_x)
10 loops, best of 3: 86.3 ms per loop
In [60]: abs(interp3d(x, y, new_x) - cyfile.interp3d(x, y, new_x)).max()
Out[60]: 2.2204460492503131e-16
Though, one can argue that the Cython code is easier to read.
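On newer NumPy (1.15+), np.take_along_axis makes the same flat-index trick a bit more direct, since the (nx, ny) shape can be kept throughout. A hedged sketch of the same linear interpolation (my addition, not part of the original answer):
import numpy as np

def interp3d_tal(x, y, new_x):
    # x: (nz,), y: (nx, ny, nz), new_x: (nx, ny)
    k = np.searchsorted(x, new_x).clip(1, len(x) - 1)   # bracketing index, shape (nx, ny)
    x_lo, x_hi = x[k - 1], x[k]                         # abscissa brackets, shape (nx, ny)
    y_lo = np.take_along_axis(y, (k - 1)[..., None], axis=-1)[..., 0]
    y_hi = np.take_along_axis(y, k[..., None], axis=-1)[..., 0]
    p = (new_x - x_lo) / (x_hi - x_lo)                  # fractional position inside the bracket
    return (1 - p) * y_lo + p * y_hi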
As the numpy suggestion above was taking too long to arrive, I couldn't wait, so here's the Cython version for future reference. From some loose benchmarks it is about 3000 times faster (granted, it only does linear interpolation and doesn't do as much as interp1d, but it's OK for this purpose).
import numpy as N
cimport numpy as N
cimport cython
DTYPEf = N.float64
ctypedef N.float64_t DTYPEf_t
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative-index wraparound for entire function
cpdef interp3d(N.ndarray[DTYPEf_t, ndim=1] x, N.ndarray[DTYPEf_t, ndim=3] y,
               N.ndarray[DTYPEf_t, ndim=2] new_x):
    """
    interp3d(x, y, new_x)

    Performs linear interpolation over the last dimension of a 3D array,
    according to new values from a 2D array new_x. Thus, interpolate
    y[i, j, :] for new_x[i, j].

    Parameters
    ----------
    x : 1-D ndarray (double type)
        Array containing the x (abscissa) values. Must be monotonically
        increasing.
    y : 3-D ndarray (double type)
        Array containing the y values to interpolate.
    new_x : 2-D ndarray (double type)
        Array with the new abscissas to interpolate for.

    Returns
    -------
    new_y : 2-D ndarray
        Interpolated values.
    """
    cdef int nx = y.shape[0]
    cdef int ny = y.shape[1]
    cdef int nz = y.shape[2]
    cdef int i, j, k
    cdef N.ndarray[DTYPEf_t, ndim=2] new_y = N.zeros((nx, ny), dtype=DTYPEf)

    for i in range(nx):
        for j in range(ny):
            for k in range(1, nz):
                if x[k] > new_x[i, j]:
                    new_y[i, j] = (y[i, j, k] - y[i, j, k - 1]) * \
                        (new_x[i, j] - x[k-1]) / (x[k] - x[k - 1]) + y[i, j, k - 1]
                    break
    return new_y
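For completeness, a minimal build sketch for the .pyx above; the file name interp3d_cy.pyx and the setuptools layout are assumptions, not part of the original answer. Compile with python setup.py build_ext --inplace and then import interp3d from the resulting module.
# setup.py (hedged sketch; assumes the Cython code above is saved as interp3d_cy.pyx)
from setuptools import setup
from Cython.Build import cythonize
import numpy

setup(
    ext_modules=cythonize("interp3d_cy.pyx"),
    include_dirs=[numpy.get_include()],   # needed because the module cimports numpy
)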
Building on @pv.'s answer, and vectorising the inner loop, the following gives a substantial speedup (EDIT: changed the expensive numpy.tile to numpy.lib.stride_tricks.as_strided):
import numpy
from scipy import interpolate
nx = 30
ny = 40
nz = 50
y = numpy.random.randn(nx, ny, nz)
x = numpy.float64(numpy.arange(0, nz))
# We select some locations in the range [0.1, nz-0.1]
new_z = numpy.random.random_integers(1, (nz-1)*10, size=(nx, ny))/10.0
# y is a 3D ndarray
# x is a 1D ndarray with the abcissa values
# new_z is a 2D array
def original_interpolation():
    result = numpy.empty(y.shape[:-1])
    for i in range(nx):
        for j in range(ny):
            f = interpolate.interp1d(x, y[i, j], axis=-1, kind='linear')
            result[i, j] = f(new_z[i, j])
    return result
grid_x, grid_y = numpy.mgrid[0:nx, 0:ny]
def faster_interpolation():
    flat_new_z = new_z.ravel()
    k = numpy.searchsorted(x, flat_new_z)
    k = k.reshape(nx, ny)
    lower_index = [grid_x, grid_y, k-1]
    upper_index = [grid_x, grid_y, k]
    tiled_x = numpy.lib.stride_tricks.as_strided(x, shape=(nx, ny, nz),
                                                 strides=(0, 0, x.itemsize))
    z_upper = tiled_x[upper_index]
    z_lower = tiled_x[lower_index]
    z_step = z_upper - z_lower
    z_delta = new_z - z_lower
    y_lower = y[lower_index]
    result = y_lower + z_delta * (y[upper_index] - y_lower) / z_step
    return result
# both should be the same (giving a small difference)
print numpy.max(
numpy.abs(original_interpolation() - faster_interpolation()))
That gives the following times on my machine:
In [8]: timeit foo.original_interpolation()
10 loops, best of 3: 102 ms per loop
In [9]: timeit foo.faster_interpolation()
1000 loops, best of 3: 564 us per loop
Going to nx = 300, ny = 300 and nz = 500 gives a 130x speedup:
In [2]: timeit original_interpolation()
1 loops, best of 3: 8.27 s per loop
In [3]: timeit faster_interpolation()
10 loops, best of 3: 60.1 ms per loop
You'd need to write your own algorithm for cubic interpolation, but it shouldn't be too hard.
You could use map_coordinates for that:
from numpy import random, meshgrid, arange
from scipy.ndimage import map_coordinates
(nx, ny, nz) = (4, 5, 6)
# some random array
A = random.rand(nx, ny, nz)
# random floating-point indices in [0, nz-1]
Z = random.rand(nx, ny)*(nz-1)
# regular integer indices of shape (nx,ny)
X, Y = meshgrid(arange(nx), arange(ny), indexing='ij')
coords = (X, Y, Z) # X, Y, and Z are of shape (nx, ny)
print map_coordinates(A, coords, order=1, cval=-999.)
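Note that map_coordinates treats Z as fractional indices into the last axis, i.e. it assumes the abscissa is the regular grid 0..nz-1. If the x values from the original question are non-uniform, one hedged option (my addition, not part of this answer) is to convert the target values to fractional indices first with np.interp:
# x, new_z and nz are the names from the original question; x must be increasing
Zidx = np.interp(new_z.ravel(), x, np.arange(nz)).reshape(new_z.shape)
result = map_coordinates(A, (X, Y, Zidx), order=1, cval=-999.)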
Although there are several nice answers,
they're still doing 250k interpolations in a fixed 500-long array:
j250k = np.searchsorted( X500, X250k ) # indices in [0, 500)
This can be sped up with a LUT, LookUp Table, with say 5k slots:
lut = np.interp( np.arange(5000), X500, np.arange(500) ).round().astype(int)
xscale = (X - X.min()) * (5000 - 1) \
/ (X.max() - X.min())
j = lut.take( xscale.astype(int), mode="clip" ) # take(floats) in numpy 1.7 ?
#---------------------------------------------------------------------------
# X | | | | |
# j 0 1 2 3 4 ...
# LUT |....|.......|.|.............|.... -> int j (+ offset in [0, 1) )
#---------------------------------------------------------------------------
searchsorted is pretty fast, time ~ log2(500), so this is probably not much faster.
But LUTs are very fast in C, a simple speed / memory tradeoff.
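To make the idea concrete, here is a hedged, self-contained sketch of the LUT approach (X500 and X250k are stand-ins for the sorted 500-point abscissa and the 250k query points; the slot count of 5000 is arbitrary):
import numpy as np

X500 = np.sort(np.random.rand(500))                    # sorted abscissa
X250k = np.random.uniform(X500[0], X500[-1], 250000)   # query points
nslots = 5000

# Precompute once: an approximate searchsorted index for each slot of a uniform grid
grid = np.linspace(X500[0], X500[-1], nslots)
lut = np.searchsorted(X500, grid).clip(1, len(X500) - 1)

# Per query: scale to a slot number, then look up the (approximate) index.
# j can be off by one near slot boundaries: accuracy traded for speed.
slots = ((X250k - X500[0]) * (nslots - 1) / (X500[-1] - X500[0])).astype(int)
j = lut.take(slots, mode="clip")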
Related
I want to find all the numbers that are not in a list and can be obtained from the following formula: x + x_max * y + x_max * y_max * z, where (x_max, y_max) are parameters and (x, y, z) can vary over a restricted set. I also want to know the values of (x, y, z) that I need to obtain each number.
I have managed to do this with the following code, but it is currently extremely slow due to the three nested loops. I am looking for a faster solution (maybe using NumPy arrays?).
This is what my code currently does (it works, but extremely slowly):
import numpy as np
x_max, y_max, z_max = 180, 90, 90
numbers = np.random.randint(x_max * y_max * z_max, size=1000)
results = np.array([[0, 0, 0, 0]])  # Initializes the array
for x in range(0, x_max):
    for y in range(0, y_max):
        for z in range(0, z_max):
            result = x + y * x_max + z * x_max * y_max
            if (result not in numbers):
                results = np.append(results, [[x, y, z, result]], axis=0)
# Initial line is useless
results = np.delete(results, (0), axis=0)
Your nested loop and the calculation:
for x in range(0, x_max):
    for y in range(0, y_max):
        for z in range(0, z_max):
            result = x + y * x_max + z * x_max * y_max
simply calculate every integer from 0 up to (but not including) x_max * y_max * z_max. And all the integers are unique as well: no integer is calculated twice.
That fact makes this a lot easier:
values = np.arange(x_max * y_max * z_max)
results = np.setdiff1d(values, numbers)
This will give you all the integers that have been calculated and are not in the numbers exclusion list.
Now you are only missing the input x, y and z values. These can be recovered from the results with some straightforward integer and modulo arithmetic:
z = results // (x_max * y_max)
rem = results % (x_max * y_max)
y = rem // x_max
x = rem % x_max
Now you can stack it all nicely together:
results = np.array([x, y, z, results])
You can tweak your results array if needs be, e.g.:
results = results.T # simple transpose
results = np.sort(results, axis=1) # sort the inner list
I used the above to compare the output from this calculation with that of the triple-nested loop calculation. The results were indeed equal.
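Putting those pieces together, a hedged sketch of the whole thing as one function (the name not_seen and the divmod formulation are mine, not the answer's):
import numpy as np

def not_seen(x_max, y_max, z_max, numbers):
    values = np.arange(x_max * y_max * z_max)
    results = np.setdiff1d(values, numbers)        # sorted survivors not in `numbers`
    z, rem = np.divmod(results, x_max * y_max)     # z = results // (x_max * y_max)
    y, x = np.divmod(rem, x_max)                   # y = rem // x_max, x = rem % x_max
    return np.stack([x, y, z, results], axis=1)    # one row per (x, y, z, result)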
Ok, so first of all, the main reason your code is slow is not the nested looping, it's that you're using np.append: this allocates an entirely new array on every call and should almost never be used in a loop. You're much better off using a plain Python list, for which appending is amortized O(1) (internally it over-allocates by a small factor, so it only actually reallocates memory once the list outgrows its current allocation).
The following should run somewhere on the order of 100x faster.
import numpy as np
x_max, y_max, z_max = 180, 90, 90
numbers = np.random.randint(x_max * y_max * z_max, size=1000)
# results = np.array([[0, 0, 0, 0]])  # Initializes the array  <-- Unnecessary
results = []  # <-- Use a python list when you want to expand things
for x in range(0, x_max):
    print(f'x={x}')
    for y in range(0, y_max):
        for z in range(0, z_max):
            result = x + y * x_max + z * x_max * y_max
            if (result not in numbers):
                # results = np.append(results, [[x, y, z, result]], axis=0)  # <-- np.append is very slow, O(N)
                results.append([x, y, z, result])  # <-- list.append is O(1)
results = np.array(results)
# Initial line is useless
# results = np.delete(results, (0), axis=0)  # <-- Unnecessary without the unnecessary initialization.
... but, we can still get faster using numpy vectorization.
# ... See above code for computation of "results" (assumes `import time` and a
# `t_start = time.time()` were added before that loop, for the timing prints below)
print(f'Found result with python loop and list in {time.time() - t_start:.3f}s')
t_start = time.time()
# Get the x, y, z indices that you'd normally get from a nested loop. They'll be in arrays of shape (x_max, y_max, z_max)
xs, ys, zs = np.meshgrid(np.arange(x_max), np.arange(y_max), np.arange(z_max), indexing='ij')
all_values = xs + ys * x_max + zs * x_max * y_max
valid_indices = ~np.isin(all_values, numbers) # Get a shape (x_max, y_max, z_max) boolean mask
# Now use the mask to filter each array (yielding a flat (x_max*y_max*zmax) v[valid_indices] array)
# ... Then reshape it into a (x_max*y_max*zmax, 1) array
# ... So it can be stacked horizontally (h-stack) with the others along the second axis.
results_vectorized = np.hstack([v[valid_indices].reshape(-1, 1) for v in (xs, ys, zs, all_values)])
assert np.array_equal(results_vectorized, results)
print(f'Found result in {time.time() - t_start:.3f}s')
This is around 20x faster than the previous:
Found result with python loop and list in 3.630s
Found result in 0.154s
Speeding things up with numpy is always the same process:
- construct a big array containing all the information
- do the computations on the whole array at the same time
Here I use np.fromfunction to construct the big array from the formula you gave.
So here is my ~60x speedup solution:
import numpy as np
from functools import partial
def formula(x, y, z, x_max, y_max, z_max):
    return x + y * x_max + z * x_max * y_max

def my_solution(x_max, y_max, z_max, seen):
    seen = np.unique(seen)
    results = np.fromfunction(
        partial(formula, x_max=x_max, y_max=y_max, z_max=z_max),
        shape=(x_max, y_max, z_max),
    )
    mask_not_seen = ~np.isin(results, seen)
    results_not_seen = results[mask_not_seen]
    indices_not_seen = np.where(mask_not_seen)
    return np.stack([*indices_not_seen, results_not_seen], axis=-1)
Let's check that it outputs the same as your solution:
x_max, y_max, z_max = 18, 9, 9
seen = np.random.randint(x_max * y_max * z_max, size=100)
op_out = op_solution(x_max, y_max, z_max, seen)
my_out = my_solution(x_max, y_max, z_max, seen)
assert np.all(op_out == my_out)
and that it is indeed quicker (~60x):
...: %timeit op_solution(x_max, y_max, z_max, seen)
...: %timeit my_solution(x_max, y_max, z_max, seen)
9.74 ms ± 37.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
161 µs ± 2.36 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
Finally with the values you gave:
...: seen = np.random.randint(x_max * y_max * z_max, size=1000)
...: %timeit my_solution(x_max=180, y_max=90, z_max=90, seen=seen)
242 ms ± 3.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
I'm able to use numpy.polynomial to fit terms to 1D polynomials like f(x) = 1 + x + x^2. How can I fit multidimensional polynomials, like f(x,y) = 1 + x + x^2 + y + yx + yx^2 + y^2 + y^2x + y^2x^2? It looks like numpy doesn't support multidimensional polynomials at all: is that the case? In my real application I have 5 dimensions of input and I am interested in Hermite polynomials. It looks like the polynomials in scipy.special are also only available for one dimension of input.
# One dimension of data can be fit
x = np.random.random(100)
y = np.sin(x)
params = np.polynomial.polynomial.polyfit(x, y, 6)
np.polynomial.polynomial.polyval([0, .2, .5, 1.5], params)
array([ -5.01799432e-08, 1.98669317e-01, 4.79425535e-01,
9.97606096e-01])
# When I try two dimensions, it fails.
x = np.random.random((100, 2))
y = np.sin(5 * x[:,0]) + .4 * np.sin(x[:,1])
params = np.polynomial.polynomial.polyvander2d(x, y, [6, 6])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-13-5409f9a3e632> in <module>()
----> 1 params = np.polynomial.polynomial.polyvander2d(x, y, [6, 6])
/usr/local/lib/python2.7/site-packages/numpy/polynomial/polynomial.pyc in polyvander2d(x, y, deg)
1201 raise ValueError("degrees must be non-negative integers")
1202 degx, degy = ideg
-> 1203 x, y = np.array((x, y), copy=0) + 0.0
1204
1205 vx = polyvander(x, degx)
ValueError: could not broadcast input array from shape (100,2) into shape (100)
I got annoyed that there is no simple function for a 2D polynomial fit of an arbitrary number of degrees, so I made my own. Like the other answers, it uses least squares (lstsq) to find the best coefficients.
import numpy as np
from scipy.linalg import lstsq
from scipy.special import binom
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def _get_coeff_idx(coeff):
    idx = np.indices(coeff.shape)
    idx = idx.T.swapaxes(0, 1).reshape((-1, 2))
    return idx

def _scale(x, y):
    # Normalize x and y to avoid huge numbers
    # Mean 0, Variation 1
    offset_x, offset_y = np.mean(x), np.mean(y)
    norm_x, norm_y = np.std(x), np.std(y)
    x = (x - offset_x) / norm_x
    y = (y - offset_y) / norm_y
    return x, y, (norm_x, norm_y), (offset_x, offset_y)

def _unscale(x, y, norm, offset):
    x = x * norm[0] + offset[0]
    y = y * norm[1] + offset[1]
    return x, y

def polyvander2d(x, y, degree):
    A = np.polynomial.polynomial.polyvander2d(x, y, degree)
    return A

def polyscale2d(coeff, scale_x, scale_y, copy=True):
    if copy:
        coeff = np.copy(coeff)
    idx = _get_coeff_idx(coeff)
    for k, (i, j) in enumerate(idx):
        coeff[i, j] /= scale_x ** i * scale_y ** j
    return coeff
def polyshift2d(coeff, offset_x, offset_y, copy=True):
    if copy:
        coeff = np.copy(coeff)
    idx = _get_coeff_idx(coeff)
    # Copy coeff because it changes during the loop
    coeff2 = np.copy(coeff)
    for k, m in idx:
        not_the_same = ~((idx[:, 0] == k) & (idx[:, 1] == m))
        above = (idx[:, 0] >= k) & (idx[:, 1] >= m) & not_the_same
        for i, j in idx[above]:
            b = binom(i, k) * binom(j, m)
            sign = (-1) ** ((i - k) + (j - m))
            offset = offset_x ** (i - k) * offset_y ** (j - m)
            coeff[k, m] += sign * b * coeff2[i, j] * offset
    return coeff
def plot2d(x, y, z, coeff):
    # regular grid covering the domain of the data
    if x.size > 500:
        choice = np.random.choice(x.size, size=500, replace=False)
    else:
        choice = slice(None, None, None)
    x, y, z = x[choice], y[choice], z[choice]
    X, Y = np.meshgrid(
        np.linspace(np.min(x), np.max(x), 20), np.linspace(np.min(y), np.max(y), 20)
    )
    Z = np.polynomial.polynomial.polyval2d(X, Y, coeff)
    fig = plt.figure()
    ax = fig.gca(projection="3d")
    ax.plot_surface(X, Y, Z, rstride=1, cstride=1, alpha=0.2)
    ax.scatter(x, y, z, c="r", s=50)
    plt.xlabel("X")
    plt.ylabel("Y")
    ax.set_zlabel("Z")
    plt.show()
def polyfit2d(x, y, z, degree=1, max_degree=None, scale=True, plot=False):
    """A simple 2D polynomial fit to data x, y, z
    The polynomial can be evaluated with numpy.polynomial.polynomial.polyval2d

    Parameters
    ----------
    x : array[n]
        x coordinates
    y : array[n]
        y coordinates
    z : array[n]
        data values
    degree : {int, 2-tuple}, optional
        degree of the polynomial fit in x and y direction (default: 1)
    max_degree : {int, None}, optional
        if given, the maximum combined degree of the coefficients is limited to this value
    scale : bool, optional
        Whether to scale the input arrays x and y to mean 0 and variance 1, to avoid numerical overflows.
        Especially useful at higher degrees. (default: True)
    plot : bool, optional
        whether to plot the fitted surface and data (slow) (default: False)

    Returns
    -------
    coeff : array[degree+1, degree+1]
        the polynomial coefficients in numpy 2d format, i.e. coeff[i, j] for x**i * y**j
    """
    # Flatten input
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()
    z = np.asarray(z).ravel()
    # Remove masked values
    mask = ~(np.ma.getmask(z) | np.ma.getmask(x) | np.ma.getmask(y))
    x, y, z = x[mask].ravel(), y[mask].ravel(), z[mask].ravel()
    # Scale coordinates to smaller values to avoid numerical problems at larger degrees
    if scale:
        x, y, norm, offset = _scale(x, y)
    if np.isscalar(degree):
        degree = (int(degree), int(degree))
    degree = [int(degree[0]), int(degree[1])]
    coeff = np.zeros((degree[0] + 1, degree[1] + 1))
    idx = _get_coeff_idx(coeff)
    # Calculate elements 1, x, y, x*y, x**2, y**2, ...
    A = polyvander2d(x, y, degree)
    # We only want the combinations with at most max_degree COMBINED power
    if max_degree is not None:
        mask = idx[:, 0] + idx[:, 1] <= int(max_degree)
        idx = idx[mask]
        A = A[:, mask]
    # Do the actual least squares fit
    C, *_ = lstsq(A, z)
    # Reorder coefficients into numpy compatible 2d array
    for k, (i, j) in enumerate(idx):
        coeff[i, j] = C[k]
    # Reverse the scaling
    if scale:
        coeff = polyscale2d(coeff, *norm, copy=False)
        coeff = polyshift2d(coeff, *offset, copy=False)
    if plot:
        if scale:
            x, y = _unscale(x, y, norm, offset)
        plot2d(x, y, z, coeff)
    return coeff
if __name__ == "__main__":
    n = 100
    x, y = np.meshgrid(np.arange(n), np.arange(n))
    z = x ** 2 + y ** 2
    c = polyfit2d(x, y, z, degree=2, plot=True)
    print(c)
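Evaluating the fitted surface afterwards is then a one-liner (hedged usage sketch; the query points are made up):
xq = np.array([3.5, 10.0])
yq = np.array([7.2, 20.0])
zq = np.polynomial.polynomial.polyval2d(xq, yq, c)   # c from polyfit2d above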
It doesn't look like polyfit supports fitting multivariate polynomials, but you can do it by hand, with linalg.lstsq. The steps are as follows:
Gather the degrees of the monomials x**i * y**j you wish to use in the model. Think carefully about it: your current model already has 9 parameters; if you push to 5 variables, then with the current approach you'll end up with 3**5 = 243 parameters, a sure road to overfitting. Maybe limit yourself to the monomials of total degree at most 2 or 3...
Plug the x-points into each monomial; this gives a 1D array. Stack all such arrays as columns of a matrix.
Solve a linear system with aforementioned matrix and with the right-hand side being the target values (I call them z because y is confusing when you also use x, y for two variables).
Here it is:
import numpy as np
x = np.random.random((100, 2))
z = np.sin(5 * x[:,0]) + .4 * np.sin(x[:,1])
degrees = [(i, j) for i in range(3) for j in range(3)] # list of monomials x**i * y**j to use
matrix = np.stack([np.prod(x**d, axis=1) for d in degrees], axis=-1) # stack monomials like columns
coeff = np.linalg.lstsq(matrix, z)[0] # lstsq returns some additional info we ignore
print("Coefficients", coeff) # in the same order as the monomials listed in "degrees"
fit = np.dot(matrix, coeff)
print("Fitted values", fit)
print("Original values", y)
I believe you have misunderstood what polyvander2d does and how it should be used. polyvander2d() returns the pseudo-Vandermonde matrix of degrees deg and sample points (x, y).
Here, y is not the value(s) of the polynomial at point(s) x but rather it is the y-coordinate of the point(s) and x is the x-coordinate. Roughly speaking, the returned array is a set of combinations of (x**i) * (y**j) and x and y are essentially 2D "mesh-grids". Therefore, both x and y must have identical shapes.
Your x and y arrays, however, have different shapes:
>>> x.shape
(100, 2)
>>> y.shape
(100,)
I do not believe numpy has a 5D-polyvander of the form polyvander5D(x, y, z, v, w, deg). Notice, all the variables here are coordinates and not the values of the polynomial p=p(x,y,z,v,w). You, however, seem to be using y (in the 2D case) as f.
It appears that numpy does not have 2D or higher equivalents for the polyfit() function. If your intention is to find the coefficients of the best-fitting polynomial in higher-dimensions, I would suggest that you generalize the approach described here: Equivalent of `polyfit` for a 2D polynomial in Python
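For illustration, a hedged sketch of that kind of generalization using polyvander2d itself (my example, not from the linked answer): x and y are the two coordinates (identical shapes), z holds the values to fit, and lstsq finds the coefficients.
import numpy as np

rng = np.random.default_rng(0)
pts = rng.random((100, 2))
x, y = pts[:, 0], pts[:, 1]                  # coordinates, identical shapes
z = np.sin(5 * x) + 0.4 * np.sin(y)          # values to fit

V = np.polynomial.polynomial.polyvander2d(x, y, [2, 2])    # (100, 9) design matrix
coeff, *_ = np.linalg.lstsq(V, z, rcond=None)
coeff = coeff.reshape(3, 3)                                # coeff[i, j] multiplies x**i * y**j
z_fit = np.polynomial.polynomial.polyval2d(x, y, coeff)    # evaluate the fit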
The option isn't there because nobody wants to do that. Combine the polynomials linearly (f(x,y) = 1 + x + y + x^2 + y^2) and solve the system of equations yourself.
I implemented a support vector machine in Python using the cvxopt QP solver, where I need to compute a Gram matrix whose entries are a kernel function evaluated on pairs of sample vectors. I implemented it correctly using for loops, but this strategy is computationally intensive. I would like to vectorize the code.
Example:
Here is what I have written:
K = np.array([kernel(X[i], X[j], poly=poly_kernel)
              for j in range(m)
              for i in range(m)]).reshape((m, m))
How can I vectorize the above code without for loops to achieve the same result faster?
The kernel function computes a Gaussian kernel.
Here is a quick explanation of an svm with kernel trick. Second page of this explains the problem.
Here is my full code for context.
EDIT: Here is a quick code snippet that runs what I need to vectorized in an unvectorized form
from sklearn.datasets import make_gaussian_quantiles;
import numpy as np;
X,y = make_gaussian_quantiles(mean=None, cov=1.0, n_samples=100, n_features=2, n_classes=2, shuffle=True, random_state=5);
m = X.shape[0];
def kernel(a, b, d=20, poly=True, sigma=0.5):
    if poly:
        return np.inner(a, b) ** d
    else:
        return np.exp(-np.linalg.norm((a - b) ** 2) / sigma**2)
# Need to vectorize these loops
K = np.array([kernel(X[i], X[j], poly=False)
              for j in range(m)
              for i in range(m)]).reshape((m, m))
Thanks!
Here is a vectorized version. The non-poly branch comes in two variants: a direct one, and a memory-saving one for the case where the number of features is large:
from sklearn.datasets import make_gaussian_quantiles;
import numpy as np;
X,y = make_gaussian_quantiles(mean=None, cov=1.0, n_samples=100, n_features=2, n_classes=2, shuffle=True, random_state=5);
Y,_ = make_gaussian_quantiles(mean=None, cov=1.0, n_samples=200, n_features=2, n_classes=2, shuffle=True, random_state=2);
m = X.shape[0];
n = Y.shape[0]
def kernel(a, b, d=20, poly=True, sigma=0.5):
    if poly:
        return np.inner(a, b) ** d
    else:
        return np.exp(-np.linalg.norm((a - b) ** 2) / sigma**2)
# Need to vectorize these loops
POLY = False
LOW_MEM = 0
K = np.array([kernel(X[i], Y[j], poly=POLY)
              for i in range(m)
              for j in range(n)]).reshape((m, n))
def kernel_v(X, Y=None, d=20, poly=True, sigma=0.5):
    Z = X if Y is None else Y
    if poly:
        return np.einsum('ik,jk', X, Z)**d
    elif X.shape[1] < LOW_MEM:
        return np.exp(-np.sqrt(((X[:, None, :] - Z[None, :, :])**4).sum(axis=-1)) / sigma**2)
    elif Y is None or Y is X:
        X2 = X*X
        H = np.einsum('ij,ij->i', X2, X2) + np.einsum('ik,jk', X2, 3*X2) - np.einsum('ik,jk', X2*X, 4*X)
        return np.exp(-np.sqrt(np.maximum(0, H+H.T)) / sigma**2)
    else:
        X2, Y2 = X*X, Y*Y
        E = np.einsum('ik,jk', X2, 6*Y2) - np.einsum('ik,jk', X2*X, 4*Y) - np.einsum('ik,jk', X, 4*Y2*Y)
        E += np.add.outer(np.einsum('ij,ij->i', X2, X2), np.einsum('ij,ij->i', Y2, Y2))
        return np.exp(-np.sqrt(np.maximum(0, E)) / sigma**2)
print(np.allclose(K, kernel_v(X, Y, poly=POLY)))
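As an aside (my addition, not part of this answer): the kernel in the question, exp(-||(a-b)**2|| / sigma**2), is not the usual Gaussian RBF kernel exp(-||a-b||**2 / (2*sigma**2)). If the standard RBF is what was actually intended, its Gram matrix vectorizes very simply, for example with scipy.spatial.distance.cdist:
import numpy as np
from scipy.spatial.distance import cdist

def rbf_gram(X, Y=None, sigma=0.5):
    Y = X if Y is None else Y
    sq_dists = cdist(X, Y, metric="sqeuclidean")   # (m, n) matrix of ||x - y||**2
    return np.exp(-sq_dists / (2 * sigma**2))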
My Setup: Python 2.7.4.1, Numpy MKL 1.7.1, Windows 7 x64, WinPython
Context:
I tried to implement the Sequential Minimal Optimization algorithm for solving the SVM problem. I use the maximal violating pair approach.
The problem:
In the working set selection procedure I want to find the maximum value of the gradient, and its index, over the elements which meet the condition y[i]*alpha[i] < 0 (for y[i] = -1) or y[i]*alpha[i] < C (for y[i] = 1):
#y - array of -1 and 1
y=np.array([-1,1,1,1,-1,1])
#alpha- array of floats in range [0,C]
alpha=np.array([0.4,0.1,1.33,0,0.9,0])
#grad - array of floats
grad=np.array([-1,-1,-0.2,-0.4,0.4,0.2])
GMaxI=float('-inf')
GMax_idx=-1
n=alpha.shape[0] #usually n=100000
C=4
B=[0,0,C]
for i in xrange(0, n):
    yi = y[i]  # -1 or 1
    alpha_i = alpha[i]
    if (yi * alpha_i < B[yi+1]):  # B[-1+1]=0, B[1+1]=C
        if (-yi*grad[i] >= GMaxI):
            GMaxI = -yi*grad[i]
            GMax_idx = i
This procedure is called many times (~50000) and the profiler shows that it is the bottleneck.
Is it possible to vectorize this code?
Edit 1:
Added some small example data.
Edit 2:
I have checked the solutions proposed by hwlau, larsmans and Mr E. Only the solution proposed by Mr E is correct. Below is sample code with all three answers:
import numpy as np
y=np.array([ -1, -1, -1, -1, -1, -1, -1, -1])
alpha=np.array([0, 0.9, 0.4, 0.1, 1.33, 0, 0.9, 0])
grad=np.array([-3, -0.5, -1, -1, -0.2, -4, -0.4, -0.3])
C=4
B=np.array([0,0,C])
#hwlau - wrong index and value
filter = (y*alpha < C*0.5*(y+1)).astype('float')
GMax_idx = (filter*(-y*grad)).argmax()
GMax = -y[GMax_idx]*grad[GMax_idx]
print GMax_idx,GMax
#larsmans - wrong index
neg_y_grad = (-y * grad)[y * alpha < B[y + 1]]
GMaxI = np.max(neg_y_grad)
GMax_ind = np.argmax(neg_y_grad)
print GMax_ind,GMaxI
#Mr E - correct result
BY = np.take(B, y+1)
valid_mask = (y * alpha < BY)
values = -y * grad
values[~valid_mask] = np.min(values) - 1.0
GMaxI = values.max()
GMax_idx = values.argmax()
print GMax_idx,GMaxI
Output (GMax_idx, GMaxI)
0 -3.0
3 -0.2
4 -0.2
Conclusions
After checking all the solutions, the fastest one (2x-6x) is the one proposed by @ali_m. However, it requires installing some Python packages: numba and all its prerequisites.
I had some trouble using numba with class methods, so I created global functions which are autojitted with numba; my solution looks something like this:
from numba import autojit
@autojit
def FindMaxMinGrad(A, B, alpha, grad, y):
    '''
    Finds i, j indices with the maximal violating pair scheme
    A, B - 3-element arrays containing the bounds A=[-C,0,0], B=[0,0,C]
    alpha - array like, contains alpha coefficients
    grad - array like, gradient
    y - array like, labels
    '''
    GMaxI = -100000
    GMaxJ = -100000
    GMax_idx = -1
    GMin_idx = -1
    for i in range(0, alpha.shape[0]):
        if (y[i] * alpha[i] < B[y[i]+1]):
            if (-y[i]*grad[i] > GMaxI):
                GMaxI = -y[i]*grad[i]
                GMax_idx = i
        if (y[i] * alpha[i] > A[y[i]+1]):
            if (y[i]*grad[i] > GMaxJ):
                GMaxJ = y[i]*grad[i]
                GMin_idx = i
    return (GMaxI, GMaxJ, GMax_idx, GMin_idx)
class SVM(object):
    def working_set(self, ....):
        FindMaxMinGrad(.....)
You can probably do quite a lot better than plain vectorization if you use numba to JIT-compile your original code that used nested loops.
import numpy as np
from numba import autojit
@autojit
def jit_max_grad(y, alpha, grad, B):
    maxgrad = -np.inf
    maxind = -1
    for ii in xrange(alpha.shape[0]):
        if (y[ii] * alpha[ii] < B[y[ii] + 1]):
            g = -y[ii] * grad[ii]
            if g >= maxgrad:
                maxgrad = g
                maxind = ii
    return maxind, maxgrad
For comparison, here's Mr E's vectorized version:
def mr_e_max_grad(y, alpha, grad, B):
    BY = np.take(B, y+1)
    valid_mask = (y * alpha < BY)
    values = -y * grad
    values[~valid_mask] = np.min(values) - 1.0
    GMaxI = values.max()
    GMax_idx = values.argmax()
    return GMax_idx, GMaxI
Timing:
y = np.array([ -1, -1, -1, -1, -1, -1, -1, -1])
alpha = np.array([0, 0.9, 0.4, 0.1, 1.33, 0, 0.9, 0])
grad = np.array([-3, -0.5, -1, -1, -0.2, -4, -0.4, -0.3])
C = 4
B = np.array([0,0,C])
%timeit mr_e_max_grad(y, alpha, grad, B)
# 100000 loops, best of 3: 19.1 µs per loop
%timeit jit_max_grad(y, alpha, grad, B)
# 1000000 loops, best of 3: 1.07 µs per loop
Update: if you want to see what the timings look like on bigger arrays, it's easy to define a function that generates semi-realistic fake data based on your description in the question:
def make_fake(n, C=4):
    y = np.random.choice((-1, 1), n)
    alpha = np.random.rand(n) * C
    grad = np.random.randn(n)
    B = np.array([0, 0, C])
    return y, alpha, grad, B
%%timeit y, alpha, grad, B = make_fake(100000, 4)
mr_e_max_grad(y, alpha, grad, B)
# 1000 loops, best of 3: 1.83 ms per loop
%%timeit y, alpha, grad, B = make_fake(100000, 4)
jit_max_grad(y, alpha, grad, B)
# 1000 loops, best of 3: 471 µs per loop
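A side note (my addition): autojit has since been removed from Numba; on current versions the same function can be decorated with numba.njit instead, e.g.:
import numpy as np
from numba import njit

@njit
def jit_max_grad(y, alpha, grad, B):
    # same loop as above, compiled in nopython mode
    maxgrad = -np.inf
    maxind = -1
    for ii in range(alpha.shape[0]):
        if y[ii] * alpha[ii] < B[y[ii] + 1]:
            g = -y[ii] * grad[ii]
            if g >= maxgrad:
                maxgrad = g
                maxind = ii
    return maxind, maxgrad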
I think this is a fully vectorized version
import numpy as np

#y - array of -1 and 1
y = np.array([-1, 1, 1, 1, -1, 1])
#alpha - array of floats in range [0, C]
alpha = np.array([0.4, 0.1, 1.33, 0, 0.9, 0])
#grad - array of floats
grad = np.array([-1, -1, -0.2, -0.4, 0.4, 0.2])
#B - bounds array from the question
C = 4
B = np.array([0, 0, C])

BY = np.take(B, y + 1)
valid_mask = (y * alpha < BY)
values = -y * grad
values[~valid_mask] = np.min(values) - 1.0
GMaxI = values.max()
GMax_idx = values.argmax()
Here you go:
y=np.array([-1,1,1,1,-1,1])
alpha=np.array([0.4,0.1,1.33,0,0.9,0])
grad=np.array([-1,-1,-0.2,-0.4,0.4,0.2])
C=4
filter = (y*alpha < C*0.5*(y+1)).astype('float')
GMax_idx = (filter*(-y*grad)).argmax()
GMax = -y[GMax_idx]*grad[GMax_idx]
I haven't benchmarked it, but it is purely numerical and vectorized, so it should be fast.
If you change B from a list to a NumPy array, you can at least vectorize the yi * alpha_i < B[yi+1] check and push the loop inwards:
GMaxI = float('-inf')
GMax_idx = -1
for i in np.where(y * alpha < B[y + 1])[0]:
    if -y[i] * grad[i] >= GMaxI:
        GMaxI = -y[i] * grad[i]
        GMax_idx = i
That should save a bit of time. Next up, you can vectorize -y[i] * grad[i]:
GMaxI = float('-inf')
GMax_idx = -1
neg_y_grad = -y * grad
for i in np.where(y * alpha < B[y + 1])[0]:
    if neg_y_grad[i] >= GMaxI:
        GMaxI = neg_y_grad[i]
        GMax_idx = i
Finally, we can vectorize away the entire loop by using max and argmax on -y * grad, filtered by y * alpha < B[y + 1]:
neg_y_grad = (-y * grad)
GMaxI = np.max(neg_y_grad[y * alpha < B[y + 1]])
GMax_idx = np.where(neg_y_grad == GMaxI)[0][0]
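If you instead take the argmax over the filtered values directly (the variant that returned the wrong index in the question's edit), the index can be mapped back to the original arrays through the indices of the valid entries; a hedged sketch:
neg_y_grad = -y * grad
valid = np.flatnonzero(y * alpha < B[y + 1])      # indices of elements that pass the filter
GMax_idx = valid[np.argmax(neg_y_grad[valid])]    # index into the original arrays
GMaxI = neg_y_grad[GMax_idx]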
I feel like there should be a quick way of speeding up this code. I think the answer is here, but I cannot seem to get my problem into that format. The underlying problem that I am attempting to solve is to find the point-wise differences in terms of parallel and perpendicular components and create a 2D histogram of these differences.
out = np.zeros((len(rpbins)-1, len(pibins)-1))
tmp = np.zeros((len(x), 2))
for i in xrange(len(x)):
    tmp[:,0] = x - x[i]
    tmp[:,1] = y - y[i]
    para = np.sum(tmp**2, axis=-1)**(1./2)
    perp = np.abs(z - z[i])
    H, _, _ = np.histogram2d(para, perp, bins=[rpbins, pibins])
    out += H
Vectorizing things like this is tricky, because to get rid of a loop over n elements you have to construct an array of (n, n), so for large inputs you are likely to get a worse performance than with a Python loop. But it can be done:
mask = np.triu_indices(x.shape[0], 1)
para = np.sqrt((x[:, None] - x)**2 + (y[:, None] - y)**2)
perp = np.abs(z[:, None] - z)
hist, _, _ = np.histogram2d(para[mask], perp[mask], bins=[rpbins, pibins])
The mask is to avoid counting each distance twice. I have also set the diagonal offset to 1 to avoid including the 0 distances of each point to itself in the histogram. But if you don't index para and perp with it, you get the exact same result as your code.
With this sample data:
items = 100
rpbins, pibins = np.linspace(0, 1, 3), np.linspace(0, 1, 3)
x = np.random.rand(items)
y = np.random.rand(items)
z = np.random.rand(items)
I get this for my hist and your out:
>>> hist
array([[ 1795., 651.],
[ 1632., 740.]])
>>> out
array([[ 3690., 1302.],
[ 3264., 1480.]])
and out[i, j] = 2 * hist[i, j] except for i = j = 0, where out[0, 0] = 2 * hist[0, 0] + items because of the 0 distance of each item to itself.
EDIT: Tried the following after tcaswell's comment:
items = 1000
rpbins, pibins = np.linspace(0, 1, 3), np.linspace(0, 1, 3)
x, y, z = np.random.rand(3, items)
def hist1(x, y, z, rpbins, pibins):
    mask = np.triu_indices(x.shape[0], 1)
    para = np.sqrt((x[:, None] - x)**2 + (y[:, None] - y)**2)
    perp = np.abs(z[:, None] - z)
    hist, _, _ = np.histogram2d(para[mask], perp[mask], bins=[rpbins, pibins])
    return hist

def hist2(x, y, z, rpbins, pibins):
    mask = np.triu_indices(x.shape[0], 1)
    para = np.sqrt((x[:, None] - x)[mask]**2 + (y[:, None] - y)[mask]**2)
    perp = np.abs((z[:, None] - z)[mask])
    hist, _, _ = np.histogram2d(para, perp, bins=[rpbins, pibins])
    return hist

def hist3(x, y, z, rpbins, pibins):
    mask = np.triu_indices(x.shape[0], 1)
    para = np.sqrt(((x[:, None] - x)**2 + (y[:, None] - y)**2)[mask])
    perp = np.abs((z[:, None] - z)[mask])
    hist, _, _ = np.histogram2d(para, perp, bins=[rpbins, pibins])
    return hist
In [10]: %timeit -n1 -r10 hist1(x, y, z, rpbins, pibins)
1 loops, best of 10: 289 ms per loop
In [11]: %timeit -n1 -r10 hist2(x, y, z, rpbins, pibins)
1 loops, best of 10: 294 ms per loop
In [12]: %timeit -n1 -r10 hist3(x, y, z, rpbins, pibins)
1 loops, best of 10: 278 ms per loop
It seems that most of the time is spent instantiating new arrays, not doing actual computations, so while there is some efficiency to scrape off, there really isn't much.
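One hedged middle ground (my addition, not from the answer): keep a short outer loop over chunks of points so that each pass only builds (chunk, n) arrays instead of the full (n, n) ones. Note this reproduces the original code's double counting of every ordered pair, including self-pairs, so it matches out rather than hist:
import numpy as np

def hist_chunked(x, y, z, rpbins, pibins, chunk=256):
    out = np.zeros((len(rpbins) - 1, len(pibins) - 1))
    for start in range(0, len(x), chunk):
        sl = slice(start, start + chunk)
        para = np.sqrt((x[sl, None] - x)**2 + (y[sl, None] - y)**2)   # (chunk, n) block
        perp = np.abs(z[sl, None] - z)
        H, _, _ = np.histogram2d(para.ravel(), perp.ravel(),
                                 bins=[rpbins, pibins])
        out += H
    return out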