Memory usage of `numpy.random` - python

Consider the following script:
import numpy as np
import tracemalloc

def zero_mem():
    a = np.zeros((100, 100))

def nonzero_mem():
    b = np.random.randn(100, 100)

if __name__ == "__main__":
    tracemalloc.start()
    zero_mem()
    print(tracemalloc.get_traced_memory())
    tracemalloc.stop()

    tracemalloc.start()
    nonzero_mem()
    print(tracemalloc.get_traced_memory())
    tracemalloc.stop()
The output, running NumPy 1.22.2 on Python 3.8.10, is:
(0, 80096)
(72, 80168)
The question is: why isn't the second row (0, 80168)? In other words: why is there memory still in use after nonzero_mem(), unlike when calling zero_mem()?
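One way to see what is still holding that memory (a debugging sketch I am adding, not part of the original question) is to take a tracemalloc snapshot after nonzero_mem() returns and print the traceback of the surviving allocation; the lingering 72 bytes should then be attributed to an allocation site inside numpy.random rather than to the discarded array b:

import numpy as np
import tracemalloc

def nonzero_mem():
    b = np.random.randn(100, 100)

if __name__ == "__main__":
    tracemalloc.start(25)              # keep up to 25 frames per allocation
    nonzero_mem()
    snapshot = tracemalloc.take_snapshot()
    tracemalloc.stop()

    # only blocks that are still alive appear in the snapshot
    for stat in snapshot.statistics("traceback")[:3]:
        print(stat)
        for line in stat.traceback.format():
            print(line)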

Related

Parallelizing a for loop in a class with joblib: picklable error

I am trying to parallelize the for loop inside the active_again function, but I get this error message:
BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
When I use the for loop that is commented out underneath the Parallel line, it works fine. But at some point the code will have to work with larger numbers of C_Cells, and I need it to run faster.
# -*- coding: utf-8 -*-
import numpy as np
from scipy.spatial.distance import cdist
from joblib import Parallel, delayed  # parallelise loops
import multiprocessing as mp

num_cores = mp.cpu_count()

loc = np.array([(-1,-1,0), (0,1,0),
                (1,1,0), (-1,0,0), (1,0,0), (0,-1,0), (1,-1,0), (-1,1,0),
                (-1,1,1), (0,1,1), (1,1,1), (-1,0,1), (0,0,1), (1,0,1),
                (-1,-1,1), (0,-1,1), (1,-1,1), (-1,1,-1), (0,1,-1), (1,1,-1),
                (-1,0,-1), (0,0,-1), (1,0,-1), (-1,-1,-1), (0,-1,-1), (1,-1,-1)])

class C_Cell(object):
    def __init__(self, position0):
        self.c_position0 = position0

class C_Cells(object):
    def __init__(self, pts, c_ncell):
        self.c_cells = np.array([C_Cell(pts[i]) for i in
                                 range(c_ncell)])  # array of cell objects according to prior sample
        self.c_position = np.array([c_cell.c_position0 for c_cell in self.c_cells])
        self.q_cells = np.empty((0, 1))
        self.q_position = np.empty((0, 3), dtype=int)  # np.array([(0,0,0)])

    def go_inactive(self):
        for i in range(5):
            self.q_position = np.vstack((self.q_position, self.c_position[i]))
            self.q_cells = np.append(self.q_cells, self.c_cells[i])
            self.c_position = np.delete(self.c_position, (i), axis=0)  # mother cell leaves division pool
            self.c_cells = np.delete(self.c_cells, (i), axis=0)

    def active_again(self, e_position, s_position):
        num_cores = mp.cpu_count()
        qs = np.array(range(self.q_cells.shape[0]))

        def inner_active_again(s):
            p = self.q_position[qs[::-1][s]] + loc
            c = cdist(p, self.c_position) == 0
            p = p[~c.any(axis=1)]
            d = cdist(p, e_position) == 0
            p = p[~d.any(axis=1)]
            f = cdist(p, s_position) == 0
            p = p[~f.any(axis=1)]
            g = cdist(p, self.q_position) == 0
            p = p[~g.any(axis=1)]
            if p.shape[0] > 0:
                self.c_position = np.vstack((self.c_position, self.q_position[qs[::-1][s]]))  # cell goes into quiescence pool
                self.c_cells = np.append(self.c_cells, self.q_cells[qs[::-1][s]])
                self.q_position = np.delete(self.q_position, (qs[::-1][s]), axis=0)  # cell gets removed from the division pool
                self.q_cells = np.delete(self.q_cells, (qs[::-1][s]), axis=0)

        Parallel(n_jobs=num_cores)(delayed(inner_active_again)(i) for i in range(self.q_cells.shape[0]))
        # for i in range(self.q_cells.shape[0]):
        #     inner_active_again(i)

####################
a = loc + 1
c = loc - 1
b = np.vstack((a, loc, c))

e_position = np.array([[1, 2, 3]])
s_position = np.array([[3, 3, 3]])
s_cells = 1
c_ncell = 52

l = C_Cells(b, c_ncell)

t = 0
while t < 100:
    t += 1
    print('t=', t)
    l.go_inactive()
    l.active_again(e_position, s_position)
Any help regarding this issue would be highly appreciated.
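As a pointer to where the BrokenProcessPool comes from (a minimal sketch I am adding, not from the original post): joblib's default process-based backend has to pickle the callable it ships to the workers, and a function defined inside a method, like inner_active_again, is a local object that the standard pickle module cannot serialize:

import pickle

def outer():
    def inner(x):      # defined inside another function, so not importable by name
        return x + 1
    return inner

try:
    pickle.dumps(outer())
except Exception as err:   # AttributeError / PicklingError depending on the Python version
    print(type(err).__name__, err)

Moving inner_active_again to module level (passing it whatever state it needs as arguments), or switching to a thread-based backend such as Parallel(n_jobs=num_cores, prefer="threads"), are the usual ways around this; threads share memory but are subject to the GIL, so whether they actually speed this code up is a separate question.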

Saving images in a loop faster than multithreading / multiprocessing

Here's a timed example of multiple image arrays of different sizes being saved in a loop as well as concurrently using threads / processes:
import tempfile
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from pathlib import Path
from time import perf_counter

import numpy as np
from cv2 import cv2

def save_img(idx, image, dst):
    cv2.imwrite((Path(dst) / f'{idx}.jpg').as_posix(), image)

if __name__ == '__main__':
    l1 = np.random.randint(0, 255, (100, 50, 50, 1))
    l2 = np.random.randint(0, 255, (1000, 50, 50, 1))
    l3 = np.random.randint(0, 255, (10000, 50, 50, 1))
    temp_dir = tempfile.mkdtemp()
    workers = 4
    t1 = perf_counter()
    for ll in l1, l2, l3:
        t = perf_counter()
        for i, img in enumerate(ll):
            save_img(i, img, temp_dir)
        print(f'Time for {len(ll)}: {perf_counter() - t} seconds')

        for executor in ThreadPoolExecutor, ProcessPoolExecutor:
            with executor(workers) as ex:
                futures = [
                    ex.submit(save_img, i, img, temp_dir) for (i, img) in enumerate(ll)
                ]
                for f in as_completed(futures):
                    f.result()
            print(
                f'Time for {len(ll)} ({executor.__name__}): {perf_counter() - t} seconds'
            )
And I get these durations on my i5 mbp:
Time for 100: 0.09495482999999982 seconds
Time for 100 (ThreadPoolExecutor): 0.14151873999999998 seconds
Time for 100 (ProcessPoolExecutor): 1.5136184309999998 seconds
Time for 1000: 0.36972280300000016 seconds
Time for 1000 (ThreadPoolExecutor): 0.619205703 seconds
Time for 1000 (ProcessPoolExecutor): 2.016624468 seconds
Time for 10000: 4.232915643999999 seconds
Time for 10000 (ThreadPoolExecutor): 7.251599262 seconds
Time for 10000 (ProcessPoolExecutor): 13.963426469999998 seconds
Aren't threads / processes expected to require less time to achieve the same thing? And if so, why not in this case?
The timings in the code are wrong because the timer t is not reset before testing the pools. Nevertheless, the relative order of the timings is correct. A possible version of the code with a timer reset is:
import tempfile
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from pathlib import Path
from time import perf_counter

import numpy as np
from cv2 import cv2

def save_img(idx, image, dst):
    cv2.imwrite((Path(dst) / f'{idx}.jpg').as_posix(), image)

if __name__ == '__main__':
    l1 = np.random.randint(0, 255, (100, 50, 50, 1))
    l2 = np.random.randint(0, 255, (1000, 50, 50, 1))
    l3 = np.random.randint(0, 255, (10000, 50, 50, 1))
    temp_dir = tempfile.mkdtemp()
    workers = 4
    for ll in l1, l2, l3:
        t = perf_counter()
        for i, img in enumerate(ll):
            save_img(i, img, temp_dir)
        print(f'Time for {len(ll)}: {perf_counter() - t} seconds')

        for executor in ThreadPoolExecutor, ProcessPoolExecutor:
            t = perf_counter()
            with executor(workers) as ex:
                futures = [
                    ex.submit(save_img, i, img, temp_dir) for (i, img) in enumerate(ll)
                ]
                for f in as_completed(futures):
                    f.result()
            print(
                f'Time for {len(ll)} ({executor.__name__}): {perf_counter() - t} seconds'
            )
Multithreading is faster especially for I/O-bound work. In this case, compressing the images is CPU-intensive, so depending on the implementation of OpenCV and of the Python wrapper, multithreading can be much slower. In many cases the culprit is CPython's GIL, but I am not sure whether that applies here (I do not know if the GIL is released during the imwrite call). In my setup (i7 8th gen), threading is as fast as the loop for 100 images and barely faster for 1000 and 10000 images. If ThreadPoolExecutor reuses threads, there is overhead involved in assigning a new task to an existing thread; if it does not reuse threads, there is overhead involved in launching a new thread.
Multiprocessing circumvents the GIL issue, but has some other problems. First, pickling the data to pass between processes takes some time, and in the case of images it can be very expensive. Second, on Windows, spawning a new process takes a lot of time. A simple test to see the overhead (both for processes and threads) is to change the save_img function to one that does nothing but still needs pickling, etc.:
def save_img(idx, image, dst):
    if idx != idx:
        print("impossible!")
and then to a similar one without parameters, to see the overhead of spawning the processes, etc.
The timings in my setup show that 2.3 seconds are needed just to spawn the 10000 processes and 0.6 extra seconds for pickling, which is much more than the time needed for processing.
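For reference, the parameterless variant mentioned above could look roughly like this (my sketch, with the illustrative name noop; submit it with ex.submit(noop) instead of ex.submit(save_img, ...)), which separates the cost of managing the pool from the cost of pickling the image arguments:

def noop():
    # nothing is pickled besides the callable itself, so any remaining time
    # is pool management / process spawning, not argument serialization
    pass

# inside the timing loop, instead of submitting save_img with the image:
# futures = [ex.submit(noop) for _ in range(len(ll))]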
A way to improve the throughput and keep the overhead to a minimum is to break the work into chunks and submit each chunk to a worker:
import tempfile
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from pathlib import Path
from time import perf_counter

import numpy as np
from cv2 import cv2

def save_img(idx, image, dst):
    cv2.imwrite((Path(dst) / f'{idx}.jpg').as_posix(), image)

def multi_save_img(idx_start, images, dst):
    for idx, image in zip(range(idx_start, idx_start + len(images)), images):
        cv2.imwrite((Path(dst) / f'{idx}.jpg').as_posix(), image)

if __name__ == '__main__':
    l1 = np.random.randint(0, 255, (100, 50, 50, 1))
    l2 = np.random.randint(0, 255, (1000, 50, 50, 1))
    l3 = np.random.randint(0, 255, (10000, 50, 50, 1))
    temp_dir = tempfile.mkdtemp()
    workers = 4
    for ll in l1, l2, l3:
        t = perf_counter()
        for i, img in enumerate(ll):
            save_img(i, img, temp_dir)
        print(f'Time for {len(ll)}: {perf_counter() - t} seconds')

        chunk_size = len(ll) // workers
        ends = [chunk_size * (_ + 1) for _ in range(workers)]
        ends[-1] += len(ll) % workers
        starts = [chunk_size * _ for _ in range(workers)]

        for executor in ThreadPoolExecutor, ProcessPoolExecutor:
            t = perf_counter()
            with executor(workers) as ex:
                futures = [
                    ex.submit(multi_save_img, start, ll[start:end], temp_dir)
                    for (start, end) in zip(starts, ends)
                ]
                for f in as_completed(futures):
                    f.result()
            print(
                f'Time for {len(ll)} ({executor.__name__}): {perf_counter() - t} seconds'
            )
This should give you a significant boost over the plain for loop, for both the multiprocessing and the multithreading approach.
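A related option (my addition, not part of the original answer) is to let the executor do the chunking itself via the chunksize argument of executor.map, which batches items per worker process and amortizes the pickling/IPC overhead in much the same way; a sketch reusing the save_img defined above:

from concurrent.futures import ProcessPoolExecutor

def save_all(images, dst, workers=4):
    with ProcessPoolExecutor(workers) as ex:
        # chunksize batches the submitted items; it is honored by
        # ProcessPoolExecutor (and simply ignored by ThreadPoolExecutor)
        chunk = max(1, len(images) // (workers * 4))
        list(ex.map(save_img, range(len(images)), images,
                    [dst] * len(images), chunksize=chunk))

Here list(...) just forces the lazy map to run and re-raises any exception from a worker.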

How to mock an object that is an input to a function?

Usually when I mock, I have the following type of setup
# my_script.py
import numpy as np

def my_func(x):
    out = np.power(x, 2)
    return out
then to test the numpy power call in my_script:
# test_myscript.py
import numpy as np
import unittest
import mock

from my_script import my_func

class TestMyScript(unittest.TestCase):
    @mock.patch("my_script.np")
    def test_my_func(self, mock_os):
        """Test that numpy.power was called"""
        a = np.array([1, 2, 3])
        my_func(a)
        mock_os.power.assert_called_with(a, 2)

if __name__ == '__main__':
    unittest.main()
This works fine.
But now the situation changes: say I pass the numpy module as an argument into my_func. I don't know how to mock numpy in this case.
How would I mock numpy in the function below in the same way as it was mocked in test_myscript above?
Note that numpy will not be imported in my_script.py but will instead be imported in a separate script that runs functions from my_script.py.
# my_script.py
# numpy NOT imported in this script!

def my_func(x, numpy):
    out = numpy.power(x, 2)
    return out
EDIT:
Based on @Daniel Roseman's comment, I am including some more code to be explicit about how the functions are called.
# main_script.py
import numpy as np

from my_script import my_func

def main():
    a = np.array([1, 2, 3])
    my_func(a, np)  # numpy is passed into `my_func`
Then to test, I am trying the below
# test_myscript.py
import numpy as np
import unittest
import mock

from my_script import my_func

class TestMyScript(unittest.TestCase):
    @mock.patch("main_script.np")  # import from main_script since numpy is imported here
    def test_my_func(self, mock_os):
        """Test that numpy.power was called"""
        a = np.array([1, 2, 3])
        my_func(a)
        mock_os.power.assert_called_with(a, 2)

if __name__ == '__main__':
    unittest.main()
But this fails with
Ran 1 test in 0.154s
>>> FAILED (failures=1)
>>> AssertionError: Expected 'power' to have been called.
I found that using the unittest.mock.Mock object here worked best.
So if we have:
# my_script.py
def my_func(x, numpy):
    out = numpy.power(x, 2)
    return out
Then to test it we have:
# test_myscript.py
import numpy as np
import unittest
from unittest.mock import Mock

from my_script import my_func

numpy_mock = Mock()

class TestMyScript(unittest.TestCase):
    def test_my_func(self):
        """Test that numpy.power was called"""
        a = np.array([1, 2, 3])
        _ = my_func(a, numpy_mock)  # pass the mocked object here
        numpy_mock.power.assert_called_once_with(a, 2)

if __name__ == '__main__':
    unittest.main()
Which passes the test

Parallel execution of a list of functions

So, using the multiprocessing module, it is easy to run a function in parallel with different arguments like this:
from multiprocessing import Pool

def f(x):
    return x**2

p = Pool(2)
print(p.map(f, [1, 2]))
But I'm interested in executing a list of functions on the same argument. Suppose I have the following two functions:
def f(x):
    return x**2

def g(x):
    return x**3 + 2
How can I execute them in parallel for the same argument (e.g. x=1)?
You can use Pool.apply_async() for that. You bundle up tasks in the form of (function, argument_tuple) and feed every task to apply_async().
from multiprocessing import Pool
from itertools import repeat

def f(x):
    for _ in range(int(50e6)):  # dummy computation
        pass
    return x ** 2

def g(x):
    for _ in range(int(50e6)):  # dummy computation
        pass
    return x ** 3

def parallelize(n_workers, functions, arguments):
    # if you need this multiple times, instantiate the pool outside and
    # pass it in as dependency to spare recreation all over again
    with Pool(n_workers) as pool:
        tasks = zip(functions, repeat(arguments))
        futures = [pool.apply_async(*t) for t in tasks]
        results = [fut.get() for fut in futures]
    return results

if __name__ == '__main__':
    N_WORKERS = 2
    functions = f, g
    results = parallelize(N_WORKERS, functions, arguments=(10,))
    print(results)
Example Output:
[100, 1000]
Process finished with exit code 0
You can also get a tuple returned. This can be done quite easily and compactly with the lightweight joblib module:
from joblib import Parallel, delayed
import multiprocessing
import timeit

# Implementation 1
def f(x):
    return x**2, x**3 + 2

# Implementation 2 for a more sophisticated second or more functions
def g(x):
    return x**3 + 2

def f(x):
    return x**2, g(x)

if __name__ == "__main__":
    inputs = [i for i in range(32)]
    num_cores = multiprocessing.cpu_count()

    t1 = timeit.Timer()
    result = Parallel(n_jobs=num_cores)(delayed(f)(i) for i in inputs)
    print(t1.timeit(1))
Using multiprocessing.Pool, as you already have in the question:
from multiprocessing import Pool, cpu_count
import timeit

def g(x):
    return x**3 + 2

def f(x):
    return x**2, g(x)

if __name__ == "__main__":
    inputs = [i for i in range(32)]
    num_cores = cpu_count()
    p = Pool(num_cores)

    t1 = timeit.Timer()
    result = p.map(f, inputs)
    print(t1.timeit(1))
    print(result)
Example Output:
print(result)
[(0, 2), (1, 3), (4, 10), (9, 29), (16, 66), (25, 127), (36, 218), (49, 345),
(64, 514), (81, 731), (100, 1002), (121, 1333), (144, 1730), (169, 2199),
(196, 2746), (225, 3377), (256, 4098), (289, 4915), (324, 5834), (361, 6861),
(400, 8002), (441, 9263), (484, 10650), (529, 12169), (576, 13826), (625,
15627), (676, 17578), (729, 19685), (784, 21954), (841, 24391), (900, 27002),
(961, 29793)]
print(t1.timeit(1))
5.000001692678779e-07  # (with 16 CPUs and 64 GB RAM)
For inputs = range(2000), it took:
1.100000190490391e-06

cuda code error within numbapro

import numpy
import numpy as np
from numbapro import cuda

@cuda.autojit
def foo(aryA, aryB, out):
    d_ary1 = cuda.to_device(aryA)
    d_ary2 = cuda.to_device(aryB)
    # dd = numpy.empty(10, dtype=np.int32)
    d_ary1.copy_to_host(out)

griddim = 1, 2
blockdim = 3, 4

aryA = numpy.arange(10, dtype=np.int32)
aryB = numpy.arange(10, dtype=np.int32)
out = numpy.empty(10, dtype=np.int32)

foo[griddim, blockdim](aryA, aryB, out)
Exception: Caused by input line 11:
can only get attribute from globals, complex numbers or arrays
I am new to numbapro, hints are needed!
The @cuda.autojit decorator marks and compiles foo() as a CUDA kernel. The memory transfer operations should be placed outside of the kernel. It should look like the following code:
import numpy
from numbapro import cuda

@cuda.autojit
def foo(aryA, aryB, out):
    # do something here
    i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    out[i] = aryA[i] + aryB[i]

griddim = 1, 2
blockdim = 3, 4

aryA = numpy.arange(10, dtype=numpy.int32)
aryB = numpy.arange(10, dtype=numpy.int32)
out = numpy.empty(10, dtype=numpy.int32)

# transfer memory to the device
d_ary1 = cuda.to_device(aryA)
d_ary2 = cuda.to_device(aryB)
d_out = cuda.device_array_like(aryA)  # like numpy.empty_like() but for the GPU

# launch kernel
foo[griddim, blockdim](d_ary1, d_ary2, d_out)

# transfer memory device to host
d_out.copy_to_host(out)

print(out)
I recommend that new NumbaPro users look at the examples at https://github.com/ContinuumIO/numbapro-examples.
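A side note from me rather than from the original answer: NumbaPro was later discontinued and its CUDA support lives on in the open-source Numba package, where roughly the same kernel can be written with @cuda.jit. A sketch, assuming a recent Numba and a working CUDA toolkit:

import numpy as np
from numba import cuda

@cuda.jit
def add_kernel(aryA, aryB, out):
    i = cuda.grid(1)        # absolute 1D thread index
    if i < out.size:        # guard against threads beyond the array length
        out[i] = aryA[i] + aryB[i]

aryA = np.arange(10, dtype=np.int32)
aryB = np.arange(10, dtype=np.int32)
out = np.empty(10, dtype=np.int32)

d_a = cuda.to_device(aryA)
d_b = cuda.to_device(aryB)
d_out = cuda.device_array_like(aryA)

add_kernel[1, 32](d_a, d_b, d_out)   # one block of 32 threads covers the 10 elements
d_out.copy_to_host(out)
print(out)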
