See the code below. I am trying to simulate an array called xx. In the current code I simulate each entry of the matrix one by one, which is very time-consuming when M and N are large. I do it this way because in the function func the samplesize is different on every call. Is there a faster way to generate this NumPy array? Thank you.
import numpy as np

def func(x, n):
    samplesize = np.random.poisson(x)
    return np.random.uniform(0, 2, samplesize)

def singlerow(x, n, N):
    rowdata = np.zeros(N)
    rowdata[0] = 0
    for i in range(1, N):
        rowdata[i] = rowdata[i-1] + np.sum(func(x, n))
    return rowdata

def simulatematrix(x, n, M, N):
    result = np.zeros((M, N))
    for m in range(M):
        result[m] = singlerow(x, n, N)
    return result

M = 100
N = 10

xx = simulatematrix(40, 10, M, N)
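One possible way to avoid the element-by-element loop (a sketch, not taken from a posted answer; it drops the unused n argument): draw all the Poisson counts and all the uniforms in single calls, recover each cell's sum from a cumulative sum of the uniforms, and then take a row-wise cumulative sum for the running totals.

import numpy as np

def simulate_matrix_fast(x, M, N, rng=None):
    # Hypothetical vectorized variant of simulatematrix(); the unused `n` is dropped.
    rng = np.random.default_rng() if rng is None else rng
    # One Poisson count per increment (columns 1..N-1 of every row)
    counts = rng.poisson(x, size=(M, N - 1)).ravel()
    # Draw every uniform in one call ...
    u = rng.uniform(0.0, 2.0, size=counts.sum())
    # ... and sum each block of `counts[k]` consecutive values via a cumulative sum
    cu = np.concatenate(([0.0], np.cumsum(u)))
    ends = np.cumsum(counts)
    increments = (cu[ends] - cu[ends - counts]).reshape(M, N - 1)
    # Running total along each row, with the leading zero column as in singlerow()
    out = np.zeros((M, N))
    out[:, 1:] = np.cumsum(increments, axis=1)
    return out

xx = simulate_matrix_fast(40, 100, 10)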
I have a (51266, 20, 25, 3) matrix with axes (N, F, J, C), where N is the example number, F is the frame number, J is the joint, and C is the xyz coordinates of the joint. I want to calculate the Euclidean distance matrix for each frame in each example, giving a matrix of shape (51266, 20, 25, 25). My code is:
from sklearn.metrics.pairwise import euclidean_distances as euc
from tqdm import tqdm
import numpy as np

Examples = np.load('allExamples.npy')
theEuclideanMethod = np.zeros((0, 20, 25, 25))

for example in tqdm(range(Examples.shape[0])):
    euclideanBox = np.zeros((0, 25, 25))
    for frame in range(20):
        euclideanBox = np.concatenate((euclideanBox, euc(Examples[example, frame, :, :])[np.newaxis, ...]), axis=0)
    euclideanBox = euclideanBox[np.newaxis, ...]
    theEuclideanMethod = np.concatenate((theEuclideanMethod, euclideanBox))

np.save("Euclidean examples.npy", theEuclideanMethod)
print(theEuclideanMethod.shape, "Euclidean shape")
The problem is that I'm using for loops, which are super slow. How else can I modify my code to run faster?
This should run pretty fast. float32 is used to keep memory usage low, but it is optional. Increase batch_size for more speed or decrease it for lower memory usage.
import numpy as np

# Adjust batch_size depending on your memory
batch_size = 500

# Make some fake data
x = np.random.randn(51266, 20, 25, 3).astype(np.float32)
y = np.random.randn(51266, 20, 25, 3).astype(np.float32)

# distance matrix
d = np.empty(x.shape[:-1] + (x.shape[-2],), dtype=np.float32)

# Number of batches
N = (x.shape[0] - 1) // batch_size + 1

for i in range(N):
    d[i*batch_size:(i+1)*batch_size] = np.sqrt(np.sum((
        x[i*batch_size:(i+1)*batch_size, :, :, None] -
        y[i*batch_size:(i+1)*batch_size, :, None, :])**2, axis=-1))
You can use array broadcasting, like this:
import numpy as np
examples = np.random.uniform(size=(5, 6, 7, 3))
N, F, J, C = examples.shape
# deltas.shape == (N, F, J, J, C) - Cartesian deltas
deltas = examples.reshape(N, F, J, 1, C) - examples.reshape(N, F, 1, J, C)
# distances.shape == (N, F, J, J)
distances = np.sqrt((deltas**2).sum(axis=-1), dtype=np.float32)
del deltas # release memory (only needed for interactive use)
This is a bit memory-hungry: with the values of N, F, J, C that you mentioned, the intermediate results (deltas) will take 16 GB, assuming double precision. It will be more efficient (6x less memory and better use of cache) if you preallocate the output array in single precision and loop over the N axis:
distances = np.empty((N, F, J, J), dtype=np.float32)
for i, ex in enumerate(examples):
    # deltas.shape == (F, J, J, C) - Cartesian deltas
    deltas = ex.reshape(F, J, 1, C) - ex.reshape(F, 1, J, C)
    distances[i] = np.sqrt((deltas**2).sum(axis=-1))
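As a quick sanity check (illustrative only, using the same pairwise routine as in the question), the result for a single example and frame should match sklearn's pairwise distances:

from sklearn.metrics.pairwise import euclidean_distances

# compare one (example, frame) slice against the loop-based result above
assert np.allclose(distances[0, 0], euclidean_distances(examples[0, 0]), atol=1e-5)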
In Python, I have a matrix K of dimensions (N x N). I want to normalize K by dividing every entry K_ij by sqrt(K_(i,i)*K_(j,j)). What is a fast way to achieve this in Python without iterating through every entry?
My current solution is:
import numpy as np

K = np.random.rand(3, 3)
diag = np.diag(K)

for i in range(np.shape(K)[0]):
    for j in range(np.shape(K)[1]):
        K[i, j] = K[i, j] / np.sqrt(diag[i] * diag[j])
Of course you have to iterate through every entry, at least internally. For square matrices:
K / np.sqrt(np.einsum('ii,jj->ij', K, K))
If the matrix is not square, you first have to define what should replace the "missing" values K[i,i] where i > j etc.
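For readers who find einsum opaque, the same normalization can be written with np.outer on the diagonal (equivalent for square K):

import numpy as np

K = np.random.rand(3, 3)
d = np.sqrt(np.diag(K))             # sqrt(K_ii) for every index
K_normalized = K / np.outer(d, d)   # divides K[i, j] by sqrt(K[i, i] * K[j, j])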
Alternative: use numba to leave your loop as is, get free speedup, and even avoid intermediate allocation:
import numpy as np
from numba import njit

@njit
def normalize(K):
    M = np.empty_like(K)
    m, n = K.shape
    for i in range(m):
        Kii = K[i, i]
        for j in range(n):
            Kjj = K[j, j]
            M[i, j] = K[i, j] / np.sqrt(Kii * Kjj)
    return M
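Illustrative usage, checking the jitted loop against the einsum one-liner (assuming the imports above):

K = np.random.rand(500, 500)
assert np.allclose(normalize(K), K / np.sqrt(np.einsum('ii,jj->ij', K, K)))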
I have a 2D numpy array which represents a grayscale image. I need to extract every N x N sub-array within that array, with a specified overlap between sub-arrays, and calculate a property such as the mean, standard deviation, or median.
The code below performs this task but is quite slow because it uses Python for loops. Any ideas on how to vectorize this calculation or otherwise speed it up?
import numpy as np

img = np.random.randn(100, 100)
N = 4
step = 2

h, w = img.shape
out = []
for i in range(0, h - N, step):
    outr = []
    for j in range(0, w - N, step):
        outr.append(np.mean(img[i:i+N, j:j+N]))
    out.append(outr)
out = np.array(out)
For the mean and standard deviation there is a fast cumsum-based solution.
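To illustrate the cumsum idea in one dimension (a sketch for intuition, not part of the benchmarked code below): once the cumulative sum is known, every windowed sum costs O(1) instead of O(window size).

import numpy as np

a = np.random.random(20)
m, step = 5, 3

# sum(a[i:i+m]) == c[i+m] - c[i] once c is the (zero-padded) cumulative sum
c = np.concatenate(([0.0], np.cumsum(a)))
window_sums = c[m::step] - c[:-m:step]

expected = np.array([a[i:i+m].sum() for i in range(0, len(a) - m + 1, step)])
assert np.allclose(window_sums, expected)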
Here are timings for a 500x200 image, 30x20 window and step sizes 5 and 3. For comparison I use skimage.util.view_as_windows with numpy mean and std.
mn + sd using cumsum 1.1531693299184553 ms
mn using view_as_windows 3.495307120028883 ms
sd using view_as_windows 21.855629019846674 ms
Code:
import numpy as np
from math import gcd
from timeit import timeit
def wsum2d(A, winsz, stepsz, canoverwriteA=False):
    M, N = A.shape
    m, n = winsz
    i, j = stepsz
    for X, x, s in ((M, m, i), (N, n, j)):
        g = gcd(x, s)
        if g > 1:
            X //= g
            x //= g
            s //= g
            A = A[:X*g].reshape(X, g, -1).sum(axis=1)
        elif not canoverwriteA:
            A = A.copy()
            canoverwriteA = True
        A[x:] -= A[:-x]
        A = A.cumsum(axis=0)[x-1::s]
        A = A.T
    return A

def w2dmnsd(A, winsz, stepsz):
    # combine A and A*A into a complex array, so overheads apply only once
    M21 = wsum2d(A*(A+1j), winsz, stepsz, True)
    M2, mean_ = M21.real / np.prod(winsz), M21.imag / np.prod(winsz)
    sd = np.sqrt(M2 - mean_*mean_)
    return mean_, sd
# test
np.random.seed(0)
A = np.random.random((500, 200))
wsz = (30, 20)
stpsz = (5, 3)
mn, sd = w2dmnsd(A, wsz, stpsz)
from skimage.util import view_as_windows
Av = view_as_windows(A, wsz, stpsz) # this emits a warning on my system
assert np.allclose(mn, np.mean(Av, axis=(2, 3)))
assert np.allclose(sd, np.std(Av, axis=(2, 3)))
from timeit import repeat
print('mn + sd using cumsum ', min(repeat(lambda: w2dmnsd(A, wsz, stpsz), number=100))*10, 'ms')
print('mn using view_as_windows', min(repeat(lambda: np.mean(Av, axis=(2, 3)), number=100))*10, 'ms')
print('sd using view_as_windows', min(repeat(lambda: np.std(Av, axis=(2, 3)), number=100))*10, 'ms')
If Numba is an option, the main thing to do is to avoid the list appends (it also works with list appends, but more slowly).
To make use of parallelization as well, I rewrote the implementation a bit to avoid the step argument inside range, which is not supported when using prange.
Example
import numpy as np
import numba as nb

@nb.njit(error_model='numpy', parallel=True)
def calc_p(img, N, step):
    h, w = img.shape
    i_w = (h - N) // step
    j_w = (w - N) // step
    out = np.empty((i_w, j_w))
    for i in nb.prange(0, i_w):
        for j in range(0, j_w):
            out[i, j] = np.std(img[i*step:i*step+N, j*step:j*step+N])
    return out
def calc_n(img, N, step):
    h, w = img.shape
    out = []
    for i in range(0, h - N, step):
        outr = []
        for j in range(0, w - N, step):
            outr.append(np.std(img[i:i+N, j:j+N]))
        out.append(outr)
    return np.array(out)
Timings
All timings are without compilation overhead of about 0.5s (the first call to the function is excluded from the timings).
# Data
img = np.random.randn(100, 100)
N = 4
step = 2

calc_n: 17 ms
calc_p: 0.033 ms
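As a quick consistency check (illustrative, not part of the original timings; with the data above, step divides h - N, so both loops visit the same windows):

assert np.allclose(calc_n(img, N, step), calc_p(img, N, step))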
Because this is essentially a rolling mean, there is further room for improvement if N gets larger.
You could use scikit-image's block_reduce, so your code becomes:
import numpy as np
import skimage.measure
N = 4
# Your main array
a = np.arange(9).reshape(3,3)
mean = skimage.measure.block_reduce(a, (N,N), np.mean)
std_dev = skimage.measure.block_reduce(a, (N,N), np.std)
median = skimage.measure.block_reduce(a, (N,N), np.median)
However, block_reduce operates on non-overlapping blocks, so the above only covers the case where the step equals the block size N.
For the mean, you could use mean pooling, which is available in any modern ML package. For the median and standard deviation, this seems like the right approach.
The general case can be solved using scipy.ndimage.generic_filter:
import numpy as np
from scipy.ndimage import generic_filter

img = np.random.randn(100, 100)
N = 4
filtered = generic_filter(img.astype(float), np.std, size=N)

step = 2
output = filtered[::step, ::step]
However, this may not actually run much faster than a simple for loop.
To apply a mean or median filter you can use skimage.filters.rank.mean and skimage.filters.rank.median, respectively, which should be faster. There is also scipy.ndimage.median_filter. Otherwise, the mean can also be computed efficiently through simple convolution with an (N, N) array whose values are all 1/N^2. For the standard deviation you will probably have to bite the bullet and use generic_filter, unless your step size is greater than or equal to N.
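A minimal sketch of the convolution idea for the mean, using scipy.signal.convolve2d as one possible convolution routine ('valid' mode keeps only fully covered windows, and the slicing applies the step):

import numpy as np
from scipy.signal import convolve2d

img = np.random.randn(100, 100)
N, step = 4, 2

kernel = np.full((N, N), 1.0 / N**2)          # averaging kernel
means = convolve2d(img, kernel, mode='valid')[::step, ::step]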
Converting a collaborative filtering code to use sparse matrices
I'm puzzling over the following problem: given two full matrices X (m by l) and Theta (n by l), and a sparse matrix R (m by n), is there a fast way to calculate the sparse inner product, i.e. X.dot(Theta.T) evaluated only at the nonzero entries of R? Large dimensions are m and n (order 100000), while l is small (order 10). This is probably a fairly common operation for big data, since it shows up in the cost function of most linear regression problems, so I'd expect a solution built into scipy.sparse, but I haven't found anything obvious yet.
The naive way to do this in Python is R.multiply(X.dot(Theta.T)), but this evaluates the full matrix X.dot(Theta.T) (m by n, order 100000**2), which takes too much memory, only to throw away most of the entries because R is sparse.
There is a pseudo solution already here on stackoverflow, but it is non-sparse in one step:
import numpy as np
import scipy.sparse as sp

def sparse_mult_notreally(a, b, coords):
    rows, cols = coords
    rows, r_idx = np.unique(rows, return_inverse=True)
    cols, c_idx = np.unique(cols, return_inverse=True)
    C = np.array(np.dot(a[rows, :], b[:, cols]))  # this operation is dense
    return sp.coo_matrix((C[r_idx, c_idx], coords), (a.shape[0], b.shape[1]))
This works fine, and fast, for me on small enough arrays, but it barfs on my big datasets with the following error:
... in sparse_mult(a, b, coords)
132 rows, r_idx = np.unique(rows, return_inverse=True)
133 cols, c_idx = np.unique(cols, return_inverse=True)
--> 134 C = np.array(np.dot(a[rows, :], b[:, cols])) # this operation is not sparse
135 return sp.coo_matrix( (C[r_idx,c_idx],coords), (a.shape[0],b.shape[1]) )
ValueError: array is too big.
A solution which IS actually sparse, but very slow, is:
def sparse_mult(a, b, coords):
    rows, cols = coords
    n = len(rows)
    C = np.array([float(a[rows[i], :] * b[:, cols[i]]) for i in range(n)])  # this is sparse, but VERY slow
    return sp.coo_matrix((C, coords), (a.shape[0], b.shape[1]))
Does anyone know a fast, fully sparse way to do this?
I profiled 4 different solutions to your problem, and it looks like the numba jit solution is the best for any size of the array. A close second is @Alexander's Cython solution.
Here are the results (M is the number of rows in the x array):
M = 1000
function sparse_dense took 0.03 sec.
function sparse_loop took 0.07 sec.
function sparse_numba took 0.00 sec.
function sparse_cython took 0.09 sec.
M = 10000
function sparse_dense took 2.88 sec.
function sparse_loop took 0.68 sec.
function sparse_numba took 0.00 sec.
function sparse_cython took 0.01 sec.
M = 100000
function sparse_dense ran out of memory
function sparse_loop took 6.84 sec.
function sparse_numba took 0.09 sec.
function sparse_cython took 0.12 sec.
The script I used to profile these methods is:
import numpy as np
from scipy.sparse import coo_matrix
from numba import autojit, jit, float64, int32
import pyximport
pyximport.install(setup_args={"script_args": ["--compiler=mingw32"],
                              "include_dirs": np.get_include()},
                  reload_support=True)
def sparse_dense(a, b, c):
    return coo_matrix(c.multiply(np.dot(a, b)))

def sparse_loop(a, b, c):
    """Multiply sparse matrix `c` by np.dot(a, b) by looping over non-zero
    entries in `c` and using `np.dot()` for each entry."""
    N = c.size
    data = np.empty(N, dtype=float)
    for i in range(N):
        data[i] = c.data[i] * np.dot(a[c.row[i], :], b[:, c.col[i]])
    return coo_matrix((data, (c.row, c.col)), shape=(a.shape[0], b.shape[1]))
# @autojit
def _sparse_mult4(a, b, cd, cr, cc):
    N = cd.size
    data = np.empty_like(cd)
    for i in range(N):
        num = 0.0
        for j in range(a.shape[1]):
            num += a[cr[i], j] * b[j, cc[i]]
        data[i] = cd[i] * num
    return data

_fast_sparse_mult4 = \
    jit(float64[:](float64[:, :], float64[:, :], float64[:], int32[:], int32[:]))(_sparse_mult4)
def sparse_numba(a, b, c):
    """Multiply sparse matrix `c` by np.dot(a, b) using Numba's jit."""
    assert c.shape == (a.shape[0], b.shape[1])
    data = _fast_sparse_mult4(a, b, c.data, c.row, c.col)
    return coo_matrix((data, (c.row, c.col)), shape=(a.shape[0], b.shape[1]))

def sparse_cython(a, b, c):
    """Computes c.multiply(np.dot(a, b)) using Cython."""
    from sparse_mult_c import sparse_mult_c
    data = np.empty_like(c.data)
    sparse_mult_c(a, b, c.data, c.row, c.col, data)
    return coo_matrix((data, (c.row, c.col)), shape=(a.shape[0], b.shape[1]))

def unique_rows(a):
    a = np.ascontiguousarray(a)
    unique_a = np.unique(a.view([('', a.dtype)] * a.shape[1]))
    return unique_a.view(a.dtype).reshape((unique_a.shape[0], a.shape[1]))
if __name__ == '__main__':
    import time

    for M in [1000, 10000, 100000]:
        print('M = %i' % M)
        N = M + 2
        L = 10

        x = np.random.rand(M, L)
        t = np.random.rand(N, L).T

        # number of non-zero entries in sparse r matrix
        S = M * 10
        row = np.random.randint(M, size=S)
        col = np.random.randint(N, size=S)
        # remove duplicate rows and columns
        row, col = unique_rows(np.dstack((row, col)).squeeze()).T
        data = np.random.rand(row.size)
        r = coo_matrix((data, (row, col)), shape=(M, N))

        a2 = sparse_loop(x, t, r)

        for f in [sparse_dense, sparse_loop, sparse_numba, sparse_cython]:
            t0 = time.time()
            try:
                a = f(x, t, r)
            except MemoryError:
                print('function %s ran out of memory' % f.__name__)
                continue
            elapsed = time.time() - t0
            try:
                diff = abs(a - a2)
                if diff.nnz > 0:
                    assert np.max(abs(a - a2).data) < 1e-5
            except AssertionError:
                print(f.__name__)
                raise
            print('function %s took %.2f sec.' % (f.__name__, elapsed))
The Cython function is a slightly modified version of @Alexander's code:
# working from tutorial at: http://docs.cython.org/src/tutorial/numpy.html
cimport numpy as np

# Turn bounds checking back on if there are ANY problems!
cimport cython
@cython.boundscheck(False)  # turn off bounds-checking for entire function
def sparse_mult_c(np.ndarray[np.float64_t, ndim=2] a,
                  np.ndarray[np.float64_t, ndim=2] b,
                  np.ndarray[np.float64_t, ndim=1] data,
                  np.ndarray[np.int32_t, ndim=1] rows,
                  np.ndarray[np.int32_t, ndim=1] cols,
                  np.ndarray[np.float64_t, ndim=1] out):
    cdef int n = rows.shape[0]
    cdef int k = a.shape[1]
    cdef int i, j
    cdef double num

    for i in range(n):
        num = 0.0
        for j in range(k):
            num += a[rows[i], j] * b[j, cols[i]]
        out[i] = data[i] * num
Based on the extra information in the comments, I think what's throwing you off is the call to np.unique. Try the following approach:
import numpy as np
import scipy.sparse as sps
from numpy.core.umath_tests import inner1d

n = 100000
x = np.random.rand(n, 10)
theta = np.random.rand(n, 10)
rows = np.arange(n)
cols = np.arange(n)
np.random.shuffle(rows)
np.random.shuffle(cols)

def sparse_multiply(x, theta, rows, cols):
    data = inner1d(x[rows], theta[cols])
    return sps.coo_matrix((data, (rows, cols)),
                          shape=(x.shape[0], theta.shape[0]))
I get the following timings:
n = 1000
%timeit sparse_multiply(x, theta, rows, cols)
1000 loops, best of 3: 465 us per loop
n = 10000
%timeit sparse_multiply(x, theta, rows, cols)
100 loops, best of 3: 4.29 ms per loop
n = 100000
%timeit sparse_multiply(x, theta, rows, cols)
10 loops, best of 3: 61.5 ms per loop
And of course, with n = 100:
>>> np.allclose(sparse_multiply(x, theta, rows, cols).toarray()[rows, cols],
...             x.dot(theta.T)[rows, cols])
True
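Note that numpy.core.umath_tests is a private module and inner1d has been removed from recent NumPy releases; on such versions the same row-wise inner products can be written with einsum (a drop-in sketch):

import numpy as np
import scipy.sparse as sps

def sparse_multiply_einsum(x, theta, rows, cols):
    # row-wise inner products <x[rows[i]], theta[cols[i]]>, same result as inner1d
    data = np.einsum('ij,ij->i', x[rows], theta[cols])
    return sps.coo_matrix((data, (rows, cols)),
                          shape=(x.shape[0], theta.shape[0]))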
I haven't tested Jaime's answer yet (thanks again!), but in the meantime I implemented another solution that works, using Cython.
file sparse_mult_c.pyx:
# working from tutorial at: http://docs.cython.org/src/tutorial/numpy.html
cimport numpy as np

# Turn bounds checking back on if there are ANY problems!
cimport cython
@cython.boundscheck(False)  # turn off bounds-checking for entire function
def sparse_mult_c(np.ndarray[np.float64_t, ndim=2] a,
                  np.ndarray[np.float64_t, ndim=2] b,
                  np.ndarray[np.int32_t, ndim=1] rows,
                  np.ndarray[np.int32_t, ndim=1] cols,
                  np.ndarray[np.float64_t, ndim=1] C):
    cdef int n = rows.shape[0]
    cdef int k = a.shape[1]
    cdef int i, j

    for i in range(n):
        for j in range(k):
            C[i] += a[rows[i], j] * b[j, cols[i]]
Then compile it as per http://docs.cython.org/src/userguide/tutorial.html
Then in my python code, I include the following:
def sparse_mult(a, b, coords):
    # a, b are np.ndarrays
    from sparse_mult_c import sparse_mult_c
    rows, cols = coords
    C = np.zeros(rows.shape[0])
    sparse_mult_c(a, b, rows, cols, C)
    return sp.coo_matrix((C, coords), (a.shape[0], b.shape[1]))
This is fully sparse and also runs faster than even the original (memory-inefficient) solution.