the first calculation with torch.einsum is much slower - python

When I run several calculations with torch.einsum in a row, the first one is always much slower than the following calculations.
The following code and plot illustrates the problem:
import numpy as np
import torch as tor
from timeit import default_timer as timer

# Time L consecutive chained einsum contractions on fresh N x N matrices.
N = 1000
L = 10
# Use the GPU when present; fall back to the CPU so the snippet still runs.
device = "cuda:0" if tor.cuda.is_available() else "cpu"
time_arr = np.zeros(L)
for i in range(L):
    # 3 random 1000x1000 matrices for each cycle
    a = tor.randn(N, N).to(device)
    b = tor.randn(N, N).to(device)
    c = tor.randn(N, N).to(device)
    # NOTE(review): CUDA work is launched asynchronously and the first call
    # presumably also pays one-time initialisation costs; for accurate numbers
    # call tor.cuda.synchronize() before each timer() read -- confirm against
    # the PyTorch CUDA semantics notes.
    time_start = timer()
    tor.einsum("ij, kj", tor.einsum("ij, ja", a, b), c)
    time_end = timer()
    time_arr[i] = time_end - time_start
Plot of the measured time for each cycle of the loop

Related

Multiprocessing not using whole CPU

I'm testing python's module "multiprocessing". I'm trying to compute pi using a montecarlo technique using my 12 threads ryzen 5 5600.
The problem is that my CPU is not fully used: only 47% is used. My code is below. Changing the value of n_cpu barely changes the core usage, whereas increasing N by one order of magnitude raises the load to as much as 77% — but I believed that N shouldn't affect the number of processes...
Please help me understand how to correctly parallelize my code, thanks.
import random
import math
import numpy as np
import multiprocessing
from multiprocessing import Pool
def sample(n):
    """Return how many of ``n`` random points land inside the unit circle.

    Each point has both coordinates drawn from ``random.random()``; a point
    counts when ``x**2 + y**2 < 1.0``.
    """
    n_inside_circle = 0
    for _ in range(n):
        x = random.random()
        y = random.random()
        if x**2 + y**2 < 1.0:
            n_inside_circle += 1
    return n_inside_circle
N_test = 1000
N = 12 * 10**4
n_cpu = 12

# The guard keeps worker processes from re-running the driver when the
# module is re-imported by multiprocessing (spawn start method).
if __name__ == "__main__":
    pi = 0
    for j in range(N_test):
        # One equally sized share of the N samples per worker.
        part_count = [int(N / n_cpu)] * n_cpu
        # A fresh pool is created on every iteration, as in the question.
        pool = Pool(processes=n_cpu)
        results = pool.map(sample, part_count)
        pool.close()
        pool.join()  # wait for the workers to exit before starting the next pool
        pi += sum(results) / (N * 1.0) * 4
    # Average of the N_test independent estimates.
    print(pi / N_test)
The lack of cpu use is because you are sending chunks of data to multiple new process pools instead of all at once to a single process pool.
simply using
# Create the worker pool once and reuse it for every batch.
pool = Pool(processes=n_cpu)
for j in range(N_test):
    part_count = [int(N / n_cpu)] * n_cpu
    results = pool.map(sample, part_count)
    pi += sum(results) / (N * 1.0) * 4
pool.close()
should have some speed up
To optimize this further
We can change the way the jobs are split up to have more samples for a single process.
We can use Numpy's vectorized random functions that will run faster than random.random().
Finally for the last bit of speed, we can use numba with a threadpool to reduce overhead even more.
import time
import numpy as np
from multiprocessing.pool import ThreadPool
from numba import jit
#jit(nogil=True, parallel=True, fastmath=True)
def sample(n):
    """Return how many of ``n`` random points land inside the unit circle.

    Both coordinate vectors are drawn in one call to ``np.random.random`` and
    the test ``x**2 + y**2 < 1.0`` is evaluated on whole arrays.
    """
    x = np.random.random(n)
    y = np.random.random(n)
    inside_circle = np.square(x) + np.square(y) < 1.0
    # Summing a boolean array counts the True entries; return a plain int.
    return int(np.sum(inside_circle))
total_samples = int(3e9)
function_limit = int(1e7)  # samples handled by one call to sample()
n_cpu = 12
pi = 0
# The work list below only covers total_samples when it divides evenly.
assert total_samples % function_limit == 0
start = time.perf_counter()
with ThreadPool(n_cpu) as pool:
    part_count = [function_limit] * (total_samples // function_limit)
    results = pool.map(sample, part_count)
pi = 4 * sum(results) / (total_samples)
end = time.perf_counter()
print(pi)
print(round(end - start, 3), "seconds taken")
resulting in
3.141589756
6.982 seconds taken

I am using multiprocessing pool to update matrix values, but the values don't change

I have a simple function that takes a matrix 'h' and some more arguments, and adds some computed vector on top of a single column of the matrix. Then I apply that function for each column of a matrix - I have a code that does this sequentially and everything is fine there; but, since the column-wise operations are independent I want to do it in parallel. However, when I apply the same function with 'multiprocessing.Pool()', the values of the matrix don't change from the initial value.
Below is a script with both the sequential and the parallel implementation. In the end, the values of matrices 'h1' and 'h2' should be the same, but they are not; in fact 'h2' still has the value it had at the beginning (that is, the matrix 'deltas').
I am not a programmer, and don't have much experience with multiprocessing library, so maybe I am doing something stupid here...
from multiprocessing import Pool
from multiprocessing import set_start_method
import time
import numpy as np
from functools import partial
def h_single_ctr(ctr, C, keys1, bs1, h):
    """Add the contribution for column ``ctr`` to ``h[:, ctr]`` in place.

    Rows of ``keys1`` whose second entry equals ``ctr`` contribute
    ``C[first entry] * bs1[row]``; rows whose first entry equals ``ctr``
    contribute ``C[second entry] * bs1[row]``. Returns ``None``.
    """
    indices1 = np.where(keys1[:, 1] == ctr)[0]
    indices2 = np.where(keys1[:, 0] == ctr)[0]
    h[:, ctr] += C[keys1[indices1, 0]].dot(bs1[indices1])
    h[:, ctr] += C[keys1[indices2, 1]].dot(bs1[indices2])
if __name__ == '__main__':
    m, n = 100, 15000
    deltas = np.random.rand(n, m)
    C = np.random.rand(m)
    mbs = 150
    bs1 = np.random.rand(mbs, n)
    keys1 = np.random.randint(m, size=(mbs, 2))
    # Sequential
    tic = time.time()
    h1 = 0. + deltas
    for ctr in range(m):
        # Update each column of a matrix h1, using function h_single_ctr
        h_single_ctr(ctr, C, keys1, bs1, h1)
    toc = time.time()
    print('Done in {:.4f} seconds'.format(toc-tic))
    # Multiprocessing / Pool
    tic = time.time()
    h2 = 0. + deltas
    p = Pool(5)
    # Update each column of a matrix h2, using function h_single_ctr, in parallel
    # NOTE(review): h2 comes back unchanged from this call; presumably each
    # worker mutates its own copy of h2 -- confirm against the
    # multiprocessing documentation.
    p.map(partial(h_single_ctr, C=C, keys1=keys1, bs1=bs1, h=h2), range(m))
    p.close()
    p.join()
    toc = time.time()
    print('Done in {:.4f} seconds'.format(toc-tic))
    print(np.linalg.norm(h1-h2))
You can create a new variable for each process and then add them all on top of your global h. Notice that you need to have a one-dimensional array, and not a matrix within the process.
from multiprocessing import Process, Array
from threading import Thread
import time
import numpy as np
def h_single_ctr(ctr, C, keys1, bs1, h):
    """Update column ``ctr`` of ``h`` in place (sequential reference version).

    Adds ``C[keys1[r, 0]] * bs1[r]`` for every row ``r`` with
    ``keys1[r, 1] == ctr`` and ``C[keys1[r, 1]] * bs1[r]`` for every row with
    ``keys1[r, 0] == ctr``.
    """
    indices1 = np.where(keys1[:, 1] == ctr)[0]
    indices2 = np.where(keys1[:, 0] == ctr)[0]
    h[:, ctr] += C[keys1[indices1, 0]].dot(bs1[indices1])
    h[:, ctr] += C[keys1[indices2, 1]].dot(bs1[indices2])
def h_single_ctr2(ctr, C, keys1, bs1):
    """Return the update vector for column ``ctr`` without touching any matrix.

    Computes the same two dot products as ``h_single_ctr`` but returns their
    sum as a new one-dimensional array instead of adding it to ``h``.
    """
    indices1 = np.where(keys1[:, 1] == ctr)[0]
    indices2 = np.where(keys1[:, 0] == ctr)[0]
    res = C[keys1[indices1, 0]].dot(bs1[indices1])
    res += C[keys1[indices2, 1]].dot(bs1[indices2])
    return res
def h_multiple_ctr(n, C, ctr_list, keys1, bs1, h):
    """Accumulate the update vectors for several columns into the flat buffer ``h``.

    The vector for the ``i``-th entry of ``ctr_list`` is added to the slice
    ``h[i*n:(i+1)*n]``, so ``h`` must hold ``len(ctr_list) * n`` values.
    """
    for i, ctr in enumerate(ctr_list):
        res = h_single_ctr2(ctr, C, keys1, bs1)
        h[(i * n):((i + 1) * n)] += res
if __name__ == '__main__':
    m, n = 100, 15000
    deltas = np.random.rand(n, m)
    C = np.random.rand(m)
    mbs = 150
    bs1 = np.random.rand(mbs, n)
    keys1 = np.random.randint(m, size=(mbs, 2))
    num_processes = 3
    # column indices that each process will take
    col_processes = [list(range(j, m, num_processes)) for j in range(num_processes)]
    # Sequential
    tic = time.time()
    h1 = 0. + deltas
    for ctr in range(m):
        # Update each column of a matrix h1, using function h_single_ctr
        h_single_ctr(ctr, C, keys1, bs1, h1)
    toc = time.time()
    print('Done in {:.4f} seconds'.format(toc-tic))
    # Multiple processes
    tic = time.time()
    h2 = 0. + deltas
    # Start every worker before waiting on any of them; joining right after
    # start() would run the processes one after another.
    workers = []
    for cols in col_processes:
        # An integer size gives a zero-initialised shared buffer.
        hi = Array('d', len(cols) * n, lock=False)
        p = Process(target=h_multiple_ctr, args=(n, C, cols, keys1, bs1, hi))
        p.start()
        workers.append((p, cols, hi))
    for p, cols, hi in workers:
        p.join()
        # Each buffer stores one length-n vector per column, back to back.
        h2[:, cols] += np.reshape(hi, (len(cols), n)).T
    toc = time.time()
    print('Done in {:.4f} seconds'.format(toc-tic))
    print(np.linalg.norm(h1-h2))

Neither for nor while loop run in linear time O(n) in python

I measured the runtimes of a for and a while loop in python and plotted them with matplotlib.
To my surprise, the graphs aren't looking that linear, especially the one of the for loop. It also took longer to loop through 600k numbers than it took to loop through 700k.
What did I do wrong or is it just python which does things differently?
import time
import matplotlib.pyplot as plt
time_while = []
time_for = []
for i in range(200000, 1000000 + 1, 100000):
    # Time a while loop that counts up to i.
    t1 = time.time()
    n = 0
    while n < i:
        n += 1
    t2 = time.time()
    time_while.append(round(t2 - t1, 5))
    # Time a for loop over the same number of iterations.
    t1 = time.time()
    for n in range(i):
        n = n
    t2 = time.time()
    time_for.append(round(t2 - t1, 5))
# One label per value of i produced by the range above.
x = ["200k", "300k", "400k", "500k", "600k", "700k", "800k", "900k", "1Mio"]
plt.plot(x, time_while, label="while")
plt.plot(x, time_for, label="for")
plt.legend()
plt.show()
By making a slight modification to your code, adding a small summation within each loop I prolong the calculation time and the result will be more stable in terms of small fluctuations in available capacity on your cores. With this approach you clearly see the linearity that is expected.
The plot looks like this
You can see the code used below
import time
import matplotlib.pyplot as plt
time_while = []
time_for = []
for i in range(200000, 1000000 + 1, 100000):
    # Time a while loop that does a small summation on each of i iterations.
    t1 = time.time()
    n = 0
    while n < i:
        sum(k for k in range(10))
        n += 1
    t2 = time.time()
    time_while.append(round(t2 - t1, 5))
    # Time a for loop doing the same per-iteration work.
    t1 = time.time()
    for n in range(i):
        sum(k for k in range(10))
        n = n
    t2 = time.time()
    time_for.append(round(t2 - t1, 5))
# One label per value of i produced by the range above.
x = ["200k", "300k", "400k", "500k", "600k", "700k", "800k", "900k", "1Mio"]
plt.plot(x, time_while, label="while")
plt.plot(x, time_for, label="for")
plt.legend()
plt.show()

Python - Fastest / Best way to apply a function to each element of a numpy.array

I am wondering what is the fastest (or "best" due to some reason) method to apply a function to each element of a numpy array. I tried this method with a larger data set and it takes quite a while... Post your answer with the results (time in milliseconds) you got on my implementation and yours,as different HW will give different results on the same code
Please share your implementation between the 2 commented lines
import numpy as np
import time
# Some random data
x = np.random.rand(5,32,32,3)*255
x = x.astype(int)
# Defining some function
def normalize(x, a=0, b=1, x_min=0, x_max=255):
    """Linearly map ``x`` from the range ``[x_min, x_max]`` onto ``[a, b]``.

    Uses only arithmetic operators, so ``x`` may be a scalar or a NumPy array.
    """
    return a + (x - x_min) * (b - a) / (x_max - x_min)
## Start timer
start_time = time.time()
# ---------------------IMPLEMENTATION---------------------
# Apply Normalize function to each element in the array
n = np.vectorize(normalize)
x = n(x)
#_________________________________________________________
# Stop timer and show time in milliseconds
elapsed_time = time.time() - start_time
print("Time [ms] = " + str(elapsed_time*1000))
As pointed out by @sascha, I just need to apply the function to the whole array:
import numpy as np
import time
# Some random data
x = np.random.rand(5,32,32,3)*255
x = x.astype(int)
# Defining some function
def normalize(x, a=0, b=1, x_min=0, x_max=255):
    """Rescale ``x`` so that ``x_min`` maps to ``a`` and ``x_max`` maps to ``b``.

    The expression is plain arithmetic, so passing a whole NumPy array
    rescales every element in one call.
    """
    return a + (x - x_min) * (b - a) / (x_max - x_min)
## Start timer
start_time = time.time()
# ---------------------IMPLEMENTATION---------------------
# Apply Normalize function to each element in the array
x = normalize(x)
#_________________________________________________________
# Stop timer and show time in milliseconds
elapsed_time = time.time() - start_time
print("Time [ms] = " + str(elapsed_time*1000))

for loop in python is 10x slower than matlab

I run Python 2.7 and MATLAB R2010a on the same machine with an empty loop body, and I see a 10x difference in speed.
I looked online and read that the two should be of the same order.
Python slows down further once the for loop contains if statements and math operators.
My question: is this really the case, or is there some way to get them to the same order of speed?
Here is python code
import time
start_time = time.time()
for r in xrange(1000):
for c in xrange(1000):
continue
elapsed_time = time.time() - start_time
print 'time cost = ',elapsed_time
Output: time cost = 0.0377440452576
Here is matlab code
tic
for i = 1:1000
for j = 1:1000
end
end
toc
Output: Elapsed time is 0.004200 seconds
The reason this is happening is related to the JIT compiler, which is optimizing the MATLAB for loop. You can disable/enable the JIT accelerator using feature accel off and feature accel on. When you disable the accelerator, the times change dramatically.
MATLAB with accel on: Elapsed time is 0.009407 seconds.
MATLAB with accel off: Elapsed time is 0.287955 seconds.
python: time cost = 0.0511920452118
Thus the JIT accelerator is directly causing the speedup that you are noticing. There is another thing that you should consider, which is related to the way that you defined the iteration indices. In both cases, MATLAB and Python, you used iterators to define your loops. You can instead create the actual values up front: in MATLAB by adding square brackets ([]), and in Python by using range instead of xrange. When you make these changes
% MATLAB
for i = [1:1000]
for j = [1:1000]
# python
for r in range(1000):
for c in range(1000):
The times become
MATLAB with accel on: Elapsed time is 0.338701 seconds.
MATLAB with accel off: Elapsed time is 0.289220 seconds.
python: time cost = 0.0606048107147
One final consideration is if you were to add a quick computation to the loop. ie t=t+1. Then the times become
MATLAB with accel on: Elapsed time is 1.340830 seconds.
MATLAB with accel off: Elapsed time is 0.905956 seconds. (Yes off was faster)
python: time cost = 0.147221088409
I think that the moral here is that the computation speeds of for loops, out-of-the box, are comparable for extremely simple loops, depending on the situation. However, there are other, numerical tools in python which can speed things up significantly, numpy and PyPy have been brought up so far.
The basic Python implementation, CPython, is not meant to be super-speedy. If you need efficient matlab-style numerical manipulation, use the numpy package or an implementation of Python that is designed for fast work, such as PyPy or even Cython. (Writing a Python extension in C, which will of course be pretty fast, is also a possible solution, but in that case you may as well just use numpy and save yourself the effort.)
If Python execution performance is really crucial for you, you might take a look at PyPy
I did your test:
import time
for a in range(10):
start_time = time.time()
for r in xrange(1000):
for c in xrange(1000):
continue
elapsed_time = time.time()-start_time
print elapsed_time
with standard Python 2.7.3, I get:
0.0311839580536
0.0310959815979
0.0309510231018
0.0306520462036
0.0302460193634
0.0324130058289
0.0308878421783
0.0307397842407
0.0304911136627
0.0307500362396
whereas, using PyPy 1.9.0 (which corresponds to Python 2.7.2), I get:
0.00921821594238
0.0115230083466
0.00851202011108
0.00808095932007
0.00496387481689
0.00499391555786
0.00508499145508
0.00618195533752
0.005126953125
0.00482988357544
The acceleration of PyPy is really stunning and really becomes visible when its JIT compiler optimizations outweigh their cost. That's also why I introduced the extra for loop. For this example, absolutely no modification of the code was needed.
This is just my opinion, but I think the picture is a bit more complex. Basically MATLAB is an optimized layer over C, so with appropriate initialization of matrices and minimization of function calls (avoid object-like "." operators in MATLAB) you obtain extremely different results. Consider the following simple example of a wave generator built from cosine functions. MATLAB time = 0.15 s in a practical debug session; Python time = 25 s in a practical debug session (Spyder), so Python is about 166x slower. Run directly with the Python 3.7.4 interpreter, the time is approximately 5 s, which is still a non-negligible 33x.
MATLAB:
AW(1,:) = [800 , 0 ]; % [amp frec]
AW(2,:) = [300 , 4E-07];
AW(3,:) = [200 , 1E-06];
AW(4,:) = [ 50 , 4E-06];
AW(5,:) = [ 30 , 9E-06];
AW(6,:) = [ 20 , 3E-05];
AW(7,:) = [ 10 , 4E-05];
AW(8,:) = [ 9 , 5E-04];
AW(9,:) = [ 7 , 7E-04];
AW(10,:)= [ 5 , 8E-03];
phas = 0
tini = -2*365 *86400; % 2 years backwards in seconds
dt = 200; % step, 200 seconds
tfin = 0; % present
vec_t = ( tini: dt: tfin)'; % vector_time
nt = length(vec_t);
vec_t = vec_t - phas;
wave = zeros(nt,1);
for it = 1:nt
suma = 0;
t = vec_t(it,1);
for iW = 1:size(AW,1)
suma = suma + AW(iW,1)*cos(AW(iW,2)*t);
end
wave(it,1) = suma;
end
PYTHON:
import numpy as np
# Each row of AW is one cosine component: [amp, frec].
AW = np.zeros((10, 2))
AW[0, :] = [800, 0.0]
AW[1, :] = [300, 4E-07]
AW[2, :] = [200, 1E-06]
AW[3, :] = [50, 4E-06]
AW[4, :] = [30, 9E-06]
AW[5, :] = [20, 3E-05]
AW[6, :] = [10, 4E-05]
AW[7, :] = [9, 5E-04]
AW[8, :] = [7, 7E-04]
AW[9, :] = [5, 8E-03]
phas = 0
tini = -2 * 365 * 86400  # 2 years backwards
dt = 200
tfin = 0  # present
nt = round((tfin - tini) / dt) + 1
vec_t = np.linspace(tini, tfin, nt) - phas
# Vectorised form of the nested scalar loops over it and iW:
#   wave[it] = sum over iW of AW[iW, 0] * cos(AW[iW, 1] * vec_t[it])
# np.outer builds the (nt, 10) table of phase arguments; the matrix-vector
# product then applies the amplitudes and sums over the components.
wave = np.cos(np.outer(vec_t, AW[:, 1])) @ AW[:, 0]
To deal with such cases in Python, I would suggest compiling the numerical parts that may compromise the project directly to a binary (for example, writing them in C or Fortran, building an executable, and calling it from Python afterwards). Of course, other suggestions are appreciated.
I tested a FIR filter with MATLAB and same (adapted) code in Python, including a frequency sweep. The FIR filter is pretty huge, N = 100 order, I post below the two codes, but leave you here the timing results:
MATLAB: Elapsed time is 11.149704 seconds.
PYTHON: time cost = 247.8841781616211 seconds.
PYTHON IS 25 TIMES SLOWER !!!
MATLAB CODE (main):
f1 = 4000; % bandpass frequency (response = 1).
f2 = 4200; % bandreject frequency (response = 0).
N = 100; % FIR filter order.
k = 0:2*N;
fs = 44100; Ts = 1/fs; % Sampling freq. and time.
% FIR Filter numerator coefficients:
Nz = Ts*(f1+f2)*sinc((f2-f1)*Ts*(k-N)).*sinc((f2+f1)*Ts*(k-N));
f = 0:fs/2;
w = 2*pi*f;
z = exp(-i*w*Ts);
% Calculation of the expected response:
Hz = polyval(Nz,z).*z.^(-2*N);
figure(1)
plot(f,abs(Hz))
title('Gráfica Respuesta Filtro FIR (Filter Expected Response)')
xlabel('frecuencia f (Hz)')
ylabel('|H(f)|')
xlim([0, 5000])
grid on
% Sweep Frequency Test:
tic
% Start and Stop frequencies of sweep, t = tmax = 50 seconds = 5000 Hz frequency:
fmin = 1; fmax = 5000; tmax = 50;
t = 0:Ts:tmax;
phase = 2*pi*fmin*t + 2*pi*((fmax-fmin).*t.^2)/(2*tmax);
x = cos(phase);
y = filtro2(Nz, 1, x); % custom filter function, not using "filter" library here.
figure(2)
plot(t,y)
title('Gráfica Barrido en Frecuencia Filtro FIR (Freq. Sweep)')
xlabel('Tiempo Barrido: t = 10 seg = 1000 Hz')
ylabel('y(t)')
xlim([0, 50])
grid on
toc
MATLAB CUSTOM FILTER FUNCTION
function y = filtro2(Nz, Dz, x)
% FILTRO2  Filter x sample by sample with numerator Nz and denominator Dz.
%   Nz and Dz are used as row vectors and x' is stacked under a column of
%   zeros, so x is presumably a row vector -- TODO confirm with callers.
%   Returns y with length(x) samples.
Nn = length(Nz);
Nd = length(Dz);
N = length(x);
Nm = max(Nn,Nd);
% Prepend Nm-1 zeros so the first output sample has a full input history.
x1 = [zeros(Nm-1,1) ; x'];
% The first Nm-1 outputs are zero; y1 grows as y1(n) is assigned in the loop.
y1 = zeros(Nm-1,1);
for n = Nm:N+Nm-1
% Numerator part: reversed Nz times the last Nn inputs, scaled by Dz(1).
y1(n) = Nz(Nn:-1:1)*x1(n-Nn+1:n)/Dz(1);
if Nd > 1
% Denominator part: subtract reversed Dz(2:Nd) times the previous Nd-1 outputs.
y1(n) = y1(n) - Dz(Nd:-1:2)*y1(n-Nd+1:n-1)/Dz(1);
end
end
% Drop the Nm-1 leading padding samples.
y = y1(Nm:Nm+N-1);
end
PYTHON CODE (main):
import numpy as np
from matplotlib import pyplot as plt
import FiltroDigital as fd
import time
j = np.array([1j])
pi = np.pi
f1, f2 = 4000, 4200
N = 100
k = np.array(range(0,2*N+1),dtype='int')
fs = 44100; Ts = 1/fs;
Nz = Ts*(f1+f2)*np.sinc((f2-f1)*Ts*(k-N))*np.sinc((f2+f1)*Ts*(k-N));
f = np.arange(0, fs/2, 1)
w = 2*pi*f
z = np.exp(-j*w*Ts)
Hz = np.polyval(Nz,z)*z**(-2*N)
plt.figure(1)
plt.plot(f,abs(Hz))
plt.title("Gráfica Respuesta Filtro FIR")
plt.xlabel("frecuencia f (Hz)")
plt.ylabel("|H(f)|")
plt.xlim(0, 5000)
plt.grid()
plt.show()
start_time = time.time()
fmin = 1; fmax = 5000; tmax = 50;
t = np.arange(0, tmax, Ts)
fase = 2*pi*fmin*t + 2*pi*((fmax-fmin)*t**2)/(2*tmax)
x = np.cos(fase)
y = fd.filtro(Nz, [1], x)
plt.figure(2)
plt.plot(t,y)
plt.title("Gráfica Barrido en Frecuencia Filtro FIR")
plt.xlabel("Tiempo Barrido: t = 10 seg = 1000 Hz")
plt.ylabel("y(t)")
plt.xlim(0, 50)
plt.grid()
plt.show()
elapsed_time = time.time() - start_time
print('time cost = ', elapsed_time)
PYTHON CUSTOM FILTER FUNCTION
import numpy as np
def filtro(Nz, Dz, x):
    """Filter ``x`` sample by sample with numerator ``Nz`` and denominator ``Dz``.

    Implements
    ``y[n] = (sum_k Nz[k]*x[n-k] - sum_{k>=1} Dz[k]*y[n-k]) / Dz[0]``
    with zero initial conditions.

    Parameters are one-dimensional sequences; they are converted to float
    arrays. Returns a float array with ``len(x)`` samples.
    """
    Nz = np.array(Nz, dtype=float)
    Dz = np.array(Dz, dtype=float)
    x = np.array(x, dtype=float)
    Nn, Nd, N = len(Nz), len(Dz), len(x)
    Nm = max(Nn, Nd)
    # Prepend Nm-1 zeros so the first output sample has a full history.
    x1 = np.concatenate((np.zeros(Nm - 1), x))
    y1 = np.zeros(N + Nm - 1)
    # Reverse the coefficients once instead of flipping the data window on
    # every iteration.
    Nz_rev = Nz[::-1]    # Nz[Nn-1], ..., Nz[0]
    Dz_rev = Dz[:0:-1]   # Dz[Nd-1], ..., Dz[1]
    for n in range(Nm - 1, N + Nm - 1):
        y1[n] = np.dot(Nz_rev, x1[n - Nn + 1:n + 1]) / Dz[0]
        if Nd > 1:
            y1[n] -= np.dot(Dz_rev, y1[n - Nd + 1:n]) / Dz[0]
    # Drop the Nm-1 leading padding samples.
    return y1[Nm - 1:]

Categories