Increase speed by eliminating loops - python

I have the following problem. The code below successfully fits a line to my data from sample 50 to sample 400 (I never have more than 400 samples, and the first 50 are of horrendous quality). The third dimension has 7 values and the fourth dimension can have up to 10000 values, so this loop "solution" would take a lot of time. How can I avoid the for loop and decrease my runtime? Thank you for your help (I am pretty new to Python).
from sklearn.linear_model import TheilSenRegressor
import numpy as np
#ransac = linear_model.RANSACRegressor()
# Fit log(data) against the sample index with a Theil-Sen regressor, one fit
# per (i, j) pair taken from the third and fourth axes of `data`.
skip_v = 50  # number of leading samples to be skipped
N = 400
# Column vector of sample indices: the single regression feature.
test_n = np.arange(skip_v, N).reshape(-1, 1)
f_n = 7
d4 = np.shape(data)
a6 = np.ones((f_n, d4[3]))  # fitted slopes
b6 = np.ones((f_n, d4[3]))  # fitted intercepts
for j in range(d4[3]):
    for i in range(f_n):
        # Bound the slice by N so the target always has as many rows as test_n.
        theil = TheilSenRegressor(random_state=0).fit(
            test_n, np.log(data[skip_v:N, 3, i, j])
        )
        # coef_ holds one value per feature; take the single slope explicitly
        # rather than relying on implicit array-to-scalar conversion.
        a6[i, j] = theil.coef_[0]
        b6[i, j] = theil.intercept_

You can use multiprocessing to run the iterations of your loop in parallel. The following code is not working. It just demonstrates how to do it. It is only useful if your numbers are really big. Otherwise, running sequentially is faster.
from sklearn.linear_model import TheilSenRegressor
import numpy as np
import multiprocessing as mp
from itertools import product
def worker_function(input_queue, output_queue, skip_v, test_n, data):
    """Fit one Theil-Sen regression per queued task until 'STOP' is received.

    Each task read from ``input_queue`` is an ``(i, j)`` pair. For every pair
    the log of ``data[skip_v:, 3, i, j]`` is fitted against ``test_n`` and
    ``[i, j, fitted_estimator]`` is put on ``output_queue``.
    """
    # iter(callable, sentinel) keeps calling get() until it returns 'STOP'.
    for task in iter(input_queue.get, 'STOP'):
        i, j = task[0], task[1]
        theil = TheilSenRegressor(random_state=0).fit(
            test_n, np.log(data[skip_v:, 3, i, j])
        )
        output_queue.put([i, j, theil])
if __name__ == "__main__":
    # define data here
    f_n = 7
    d4 = np.shape(data)
    skip_v = 50
    N = 400
    test_n = np.reshape(range(skip_v, N), (-1, 1))
    # Result arrays for slopes and intercepts; they must exist before the
    # collection loop below writes into them.
    a6 = np.ones((f_n, d4[3]))
    b6 = np.ones((f_n, d4[3]))
    input_queue = mp.Queue()
    output_queue = mp.Queue()
    # here you create all combinations of i and j of your loop
    tasks = list(product(range(f_n), range(d4[3])))
    numProc = 4
    # start processes
    process = [mp.Process(target=worker_function,
                          args=(input_queue, output_queue,
                                skip_v, test_n, data))
               for _ in range(numProc)]
    for p in process:
        p.start()
    # queue tasks
    for task in tasks:
        input_queue.put(task)
    # signal workers to stop after tasks are all done (one marker per worker)
    for _ in range(numProc):
        input_queue.put('STOP')
    # get the results; exactly one result is expected per task
    for _ in range(len(tasks)):
        i, j, theil = output_queue.get(block=True)  # wait for results
        a6[i, j] = theil.coef_[0]  # coef_ is a length-1 array
        b6[i, j] = theil.intercept_
    # Join only after draining the output queue so no worker is left blocked
    # flushing its results.
    for p in process:
        p.join()

Related

python multiprocess very slow

I'm having trouble using Python multiprocessing.
I'm trying with a minimal version of my code:
import os
# Ask each numeric backend to use a single thread so that any parallelism
# measured below comes from the process pool only. These assignments sit
# before the numpy import; presumably the libraries read them when they are
# loaded -- TODO confirm.
os.environ["OMP_NUM_THREADS"] = "1" # just in case the system uses multithreading somehow
os.environ["OPENBLAS_NUM_THREADS"] = "1" # just in case the system uses multithreading somehow
os.environ["MKL_NUM_THREADS"] = "1" # just in case the system uses multithreading somehow
os.environ["VECLIB_MAXIMUM_THREADS"] = "1" # just in case the system uses multithreading somehow
os.environ["NUMEXPR_NUM_THREADS"] = "1" # just in case the system uses multithreading somehow
import numpy as np
from datetime import datetime as dt
from multiprocessing import Pool
from pandas import DataFrame as DF
def trytrytryshare(times):
    """Burn CPU by incrementing a counter ``times`` times; returns None."""
    counter = 0
    for _ in range(times):
        counter += 1
    return
def trymultishare(thread = 70 , times = 10):
    """Run ``thread`` copies of ``trytrytryshare(times)`` in a pool and time them.

    A pool with ``thread`` worker processes is created, one job per worker is
    submitted, and the wall-clock time until all jobs have finished is printed
    and returned (in seconds).
    """
    st = dt.now()
    print(st)
    with Pool(thread) as p:
        pending = [p.apply_async(func=trytrytryshare, args=(times,))
                   for _ in range(thread)]
        p.close()
        p.join()  # blocks until every submitted job has finished
    timecost = (dt.now()-st).total_seconds()
    # Results are ready after join(); get() re-raises any exception a worker
    # hit, which would otherwise be silently discarded.
    for job in pending:
        job.get()
    print('%d threads finished in %f secs' %(thread,timecost))
    return timecost
if __name__ == '__main__':
    # Repeat the benchmark five times for pool sizes 1, 4, 7 and 70, then
    # average the timings per pool size.
    res = DF(columns = ['thread','timecost'])
    n = 0
    for j in range(5):
        for i in list(range(1,8,3)) + [70]:
            timecost = trymultishare(thread = i,times = int(1e8))
            res.loc[n] = [i,timecost]
            n+=1
    res_sum = res.groupby('thread').mean()
    # Ratio of the single-process time to the time of each pool size.
    res_sum['decay'] = res_sum.loc[1,'timecost'] / res_sum['timecost']
on my own computer (8cores):
on my server (80 cores, im the only one using it)
i tried again, make one thread job longer.
the decay is really bad....
any idea how to "fix" this, or this is just what i can get when using multi-process?
thanks
The way you're timing apply_async is flawed. You won't know when the subprocesses have completed unless you wait for their results.
It's a good idea to work out an optimum process pool size based on number of CPUs. The code that follows isn't necessarily the best for all cases but it's what I use.
You shouldn't set the pool size to the number of processes you intend to run. That's the whole point of using a pool.
So here's a simpler example of how you could test subprocess performance.
from multiprocessing import Pool
from time import perf_counter
from os import cpu_count
def process(n):
    """Count up to ``n`` in a plain Python loop (CPU-bound work) and return it."""
    r = 0
    for _ in range(n):
        r += 1
    return r
# Upper bound for the pool size: leave two CPUs free, but keep at least one
# worker. cpu_count() can return None, hence the fallback to 1.
POOL = max((cpu_count() or 1) - 2, 1)
N = 1_000_000


def main(procs):
    """Run ``procs`` jobs of ``process(N)`` in a pool and print the duration."""
    # no need for pool size to be bigger than the number of processes to be
    # run; never below 1 because Pool rejects a size of 0
    poolsize = max(1, min(POOL, procs))
    with Pool(poolsize) as pool:
        _start = perf_counter()
        pending = [pool.apply_async(process, (N,)) for _ in range(procs)]
        for result in pending:
            # get() waits for the job to terminate and re-raises worker errors
            result.get()
        _end = perf_counter()
    print(f'Duration for {procs} processes with pool size of {poolsize} = {_end-_start:.2f}s')
if __name__ == '__main__':
    print(f'CPU count = {cpu_count()}')
    # Benchmark 10, 20, ..., 100 jobs.
    for procs in range(10, 101, 10):
        main(procs)
Output:
CPU count = 20
Duration for 10 processes with pool size of 10 = 0.12s
Duration for 20 processes with pool size of 18 = 0.19s
Duration for 30 processes with pool size of 18 = 0.18s
Duration for 40 processes with pool size of 18 = 0.28s
Duration for 50 processes with pool size of 18 = 0.30s
Duration for 60 processes with pool size of 18 = 0.39s
Duration for 70 processes with pool size of 18 = 0.42s
Duration for 80 processes with pool size of 18 = 0.45s
Duration for 90 processes with pool size of 18 = 0.54s
Duration for 100 processes with pool size of 18 = 0.59s
My guess is that you're observing the cost of spawning new processes, since apply_async returns immediately. It's much cheaper to spawn one process in the case of thread==1 instead of spawning 70 processes (your last case with the worst decay).
The fact that the server with 80 cores performs better than your laptop with 8 cores could be due to the server containing better hardware in general (better heat removal, faster CPU, etc.) or it might run a different OS. Benchmarking across different machines is non-trivial.

Python GIL with Threadpool

I have a sample code, to demonstrate python GIL, and the relevant output.
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor
import time
from itertools import repeat
from time import sleep
values = [3,4,5,6]
def cube(x, y, iterations=100000):
    """Print and return ``x`` cubed after doing some deliberate busy work.

    ``y`` is accepted but not used. ``iterations`` sets how many list
    appends and linear ``index`` searches are performed before returning;
    the default matches the original hard-coded workload.
    """
    print(f'Cube of {x}:{x*x*x}')
    c = []
    for i in range(0, iterations):
        c.append(i)
        # Busy work only: the results are intentionally discarded.
        len(c)
        c.index(i)
    return x*x*x
y = {3: 3}
if __name__ == '__main__':
    ss = time.time()
    with ThreadPoolExecutor(max_workers=5) as executor:
        # map() submits all calls right away and returns a lazy result
        # iterator, so printing it shows a generator object.
        rres = executor.map(cube, values, repeat(y))
        print(rres)
        for ij in rres:
            print(ij)
    print(f"Time taken is {time.time() - ss}")
And the output is:
Cube of 3:27
Cube of 4:64Cube of 5:125Cube of 6:216
<generator object Executor.map.<locals>.result_iterator at 0x1103b0f20>
27
64
125
216
Time taken is 234.87321090698242
The timing part of the output is expected, due to python GIL, only one thread is using the CPU at a time, but what I fail to understand is that these lines:
Cube of 3:27
Cube of 4:64Cube of 5:125Cube of 6:216
These appeared simultaneously. I expected these to come at intervals of 60 sec, like in the sequential counterpart of the code. Can anyone explain this part to me? TIA.

ProcessPoolExecutor on shared dataset and multiple arguments

I am facing an issue I was not able to solve by doing some search on the web.
I am using the minimal code below. The goal is to run some function 'f_sum' several million times by multiprocessing (using the ProcessPoolExecutor). I am adding multiple arguments by a list of tuples 'args'. In addition, the function is supposed to use some sort of data which is the same for all executions (in the example it's just one number). I do not want to add the data to the 'args' tuple for memory reasons.
The only option I found so far is defining the data outside of the `if __name__ == '__main__':` block. This will (for some reason that I do not understand) make the variable available to all processes. However, updating is not possible. Also, I do not really want to define the data outside because in the actual code it will be based on a data import and might require additional manipulation.
Hope you can help and thanks in advance!
PS: I am using Python 3.7.9 on Win 10.
from concurrent.futures import ProcessPoolExecutor
import numpy as np
data = 0  # supposed to be a large data set & shared among all calculations
num_workers = 6  # number of CPU cores
num_iterations = 10  # supposed to be large number


def f_sum(args):
    """Sum the integers below ``10**y`` and return the mean-like value plus ``data``.

    ``args`` is an ``(x, y)`` tuple: ``x`` is only used in the printed
    message, ``y`` is the exponent. The module-level ``data`` is read at call
    time.
    """
    (x, y) = args
    print('This is process', x, 'with exponent:', y)
    value = 0
    for i in range(10**y):
        value += i
    return value/10**y + data
def multiprocessing(func, args, workers):
    """Map ``func`` over ``args`` in a pool of ``workers`` processes.

    Returns the results as a list, in the order of ``args``.
    """
    with ProcessPoolExecutor(workers) as executor:
        return list(executor.map(func, args))
if __name__ == '__main__':
    # try to update data, should not be part of 'args' due to memory.
    # This assignment only runs under the __main__ guard.
    data = 0.5
    args = [(k, np.random.randint(1,8)) for k in range(num_iterations)]
    result = multiprocessing(f_sum, args, num_workers)
    # f_sum yields x.5 when data == 0, so a non-integer first result means
    # the worker did not see the updated value.
    if np.abs(result[0]-np.round(result[0])) > 0:
        print('data NOT updated')
Edit to original question:
>> Performance Example 1
from concurrent.futures import ProcessPoolExecutor
import numpy as np
import time
# Benchmark parameters for "Performance Example 1".
data_size = 10**8
num_workers = 4
num_sum = 10**7
num_iterations = 100
# Module-level statement: the array is created whenever this module is
# executed, and each execution draws different random values.
data = np.random.randint(0,100,size=data_size)
# data = np.linspace(0,data_size,data_size+1, dtype=np.uintc)
def f_sum(args):
    """Do ``num_sum`` additions of busy work and return the last data element.

    ``args`` is an ``(x, y)`` tuple used only for the printed message. The
    loop total minus its closed form ``num_sum*(num_sum-1)/2`` is zero, so
    the result equals ``data[-1]`` (as a float).
    """
    (x, y) = args
    print('This is process', x, 'random number:', y, 'last data', data[-1])
    value = 0
    for i in range(num_sum):
        value += i
    return value - num_sum*(num_sum-1)/2 + data[-1]
def multiprocessing(func, args, workers):
    """Map ``func`` over ``args`` in a pool of ``workers`` processes.

    Returns the results as a list, in the order of ``args``.
    """
    with ProcessPoolExecutor(workers) as executor:
        return list(executor.map(func, args))
if __name__ == '__main__':
    t0 = time.time()
    args = [(k, np.random.randint(1,10)) for k in range(num_iterations)]
    result = multiprocessing(f_sum, args, num_workers)
    # Every worker should report the parent's data[-1]; more than one unique
    # value means the workers saw different data arrays.
    print(f'expected result: {data[-1]}, actual result: {np.unique(result)}')
    t1 = time.time()
    print(f'total time: {t1-t0}')
>> Output
This is process 99 random number: 6 last data 9
expected result: 86, actual result: [ 3. 9. 29. 58.]
total time: 11.760863542556763
Leads to false result if randint is used. For linspace result is correct.
>> Performance Example 2 - based on proposal in answer
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from multiprocessing import Array
import time
# Benchmark parameters for "Performance Example 2".
data_size = 10**8
num_workers = 4
num_sum = 10**7
num_iterations = 100
# NOTE: this name shadows the builtin input(); it is kept because the code
# below refers to it. Being a module-level statement, the array is created
# whenever this module is executed.
input = np.random.randint(0, 100, size=data_size)
# input = np.linspace(0, data_size, data_size + 1, dtype=np.uintc)
def f_sum(args):
    """Do ``num_sum`` additions of busy work and return the last data element.

    ``args`` is an ``(x, y)`` tuple used only for the printed message.
    ``data`` is the module-level global installed by ``init_pool``. The loop
    total minus its closed form is zero, so the result equals ``data[-1]``
    (as a float).
    """
    (x, y) = args
    print('This is process', x, 'random number:', y, 'last data', data[-1])
    value = 0
    for i in range(num_sum):
        value += i
    return value - num_sum*(num_sum-1)/2 + data[-1]
def init_pool(the_data):
    """Pool initializer: store ``the_data`` in the module-level ``data`` global."""
    global data
    data = the_data
def multiprocessing(func, args, workers, input):
    """Map ``func`` over ``args`` with ``input`` shared as a C-int array.

    ``input`` is copied once into a lock-free ``multiprocessing.Array`` of
    type ``'i'``; every pool process receives that array through
    ``init_pool``. Returns the list of results in the order of ``args``.
    """
    # Convert once to C ints and bulk-copy into the shared buffer; handing
    # the array to Array() directly fills it one Python object at a time.
    values = np.asarray(input, dtype=np.intc).ravel()
    data = Array('i', values.size, lock=False)
    if values.size:
        np.frombuffer(data, dtype=np.intc)[:] = values
    with ProcessPoolExecutor(max_workers=workers, initializer=init_pool, initargs=(data,)) as executor:
        return list(executor.map(func, args))
if __name__ == '__main__':
    t0 = time.time()
    args = [(k, np.random.randint(1,10)) for k in range(num_iterations)]
    result = multiprocessing(f_sum, args, num_workers, input)
    # All workers read the same shared array, so one unique value is expected.
    print(f'expected result: {input[-1]}, actual result:{np.unique(result)}')
    t1 = time.time()
    print(f'total time: {t1-t0}')
>> Output
This is process 99 random number: 7 last data 29
expected result: 29, actual result: [29.]
total time: 30.8266122341156
@Booboo
I added two examples to my original question, the "Performance Example 2" is based on your code. First interesting finding, my original code actually gives incorrect results if the data array is initialized with random integers. I noticed, that each process by itself initializes the data array. Since it is based on random numbers each process uses a different array for calculation, and even different than the main. So that use case would not work with this code, in your code it is correct all the time.
If using linspace, however, it works, since this gives the same result each time. Same would be true for the use case where some data is read from a file (which is my actual use case). Example 1 is still about 3x faster than Example 2, and I think the time is mainly used by the initializing of the array in your method.
Regarding memory usage I don't see a relevant difference in my task manager. Both Example produce a similar increase in memory, even if the shape is different.
I still believe that your method is the correct approach, however, memory usage seems to be similar and speed is slower in the example above.
The most efficient use of memory would be to use shared memory so that all processes are working on the same instance of data. This would be absolutely necessary if the processes updated data. In the example below, since the access to data is read-only and I am using a simple array of integers, I am using multiprocessing.Array with no locking specified. The "trick" is to initialize your pool by specifying the initializer and initargs arguments so that each process in the pool has access to this shared memory. I have made a couple of other changes to the code, which I have commented.
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from multiprocessing import Array, cpu_count # new imports
def init_pool(the_data):
    """Pool initializer: store ``the_data`` in the module-level ``data`` global."""
    global data
    data = the_data
def f_sum(args):
    """Sum the integers below ``10**y``, scale by ``10**y`` and add ``len(data)``.

    ``args`` is an ``(x, y)`` tuple; ``x`` is only used in the printed
    message. ``data`` is the module-level global installed by ``init_pool``.
    """
    (x, y) = args
    print('This is process', x, 'with exponent:', y)
    value = 0
    for i in range(10**y):
        value += i
    return value/10**y + len(data)  # just use the length of data for now
def multiprocessing(func, args, workers):
    """Map ``func`` over ``args`` in a pool whose processes share one Array.

    A lock-free shared array holding 0..999 is created and handed to every
    pool process through ``init_pool``. The results are printed and returned
    as a list in the order of ``args``.
    """
    data = Array('i', range(1000), lock=False)  # read-only, integers 0, 1, 2, ... 999
    with ProcessPoolExecutor(max_workers=workers, initializer=init_pool, initargs=(data,)) as executor:
        results = list(executor.map(func, args))  # create the list of results here
    print(results)  # so that it can be printed out for demo purposes
    return results
if __name__ == '__main__':
    num_iterations = 10  # supposed to be large number
    #num_workers = 6 # number of CPU cores
    num_workers = cpu_count()  # number of CPU cores
    args = [(k, np.random.randint(1,8)) for k in range(num_iterations)]
    result = multiprocessing(f_sum, args, num_workers)
    # Kept from the question's code: reports when the first result is not a
    # whole number.
    if np.abs(result[0]-np.round(result[0])) > 0:
        print('data NOT updated')
Prints:
This is process 0 with exponent: 2
This is process 1 with exponent: 1
This is process 2 with exponent: 4
This is process 3 with exponent: 3
This is process 4 with exponent: 5
This is process 5 with exponent: 1
This is process 6 with exponent: 5
This is process 7 with exponent: 2
This is process 8 with exponent: 6
This is process 9 with exponent: 6
[1049.5, 1004.5, 5999.5, 1499.5, 50999.5, 1004.5, 50999.5, 1049.5, 500999.5, 500999.5]
data NOT updated
Updated Example 2
You saw my comments to your question concerning Example 1.
Your Example 2 is still not ideal: You have the statement input = np.random.randint(0, 100, size=data_size) as a global being needlessly executed by every process as it is initialized for use in the process pool. Below is an updated solution that also shows one way you can have your worker function work directly with a numpy array that is backed by a multiprocessing.Array instance so that the numpy array exists in shared memory. You don't have to use this technique for what you are doing since you are only using numpy to create random numbers (I am not sure why), but it is a useful technique to know. But you should re-run your code after moving the initialization code of input as I have so it is only executed once.
I don't have the occasion to work with numpy day to day but I have come to learn that it uses multithreading internally for many of its own functions (through the numerical libraries it is built on). So it is often not the best match for use with multiprocessing, although that does not seem to be applicable here since even in the case below we are just indexing an element of an array and it would not be using extra threads to accomplish that.
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from multiprocessing import Array
import time
import ctypes
# Benchmark parameters for the updated Example 2. The random input array is
# no longer created here at module level.
data_size = 10**8
num_workers = 4
num_sum = 10**7
num_iterations = 100
# input = np.linspace(0, data_size, data_size + 1, dtype=np.uintc)
def to_shared_array(arr, ctype):
    """Copy numpy array ``arr`` into a new lock-free shared ``Array`` of ``ctype``.

    The returned array is flat and holds ``arr.size`` elements in C order.
    Values are cast to ``ctype`` on assignment, so ``arr.dtype`` does not
    need to have the same item size as ``ctype``.
    """
    shared_array = Array(ctype, arr.size, lock=False)
    if arr.size:
        # View the shared buffer with the dtype implied by ctype rather than
        # arr.dtype; reinterpreting the bytes with a different item size
        # would give a view of the wrong length.
        view = np.ctypeslib.as_array(shared_array)
        view[:] = arr.ravel(order='C')
    return shared_array
def to_numpy_array(shared_array, shape):
    '''Create a numpy array backed by a shared memory Array.

    The result is a view on ``shared_array`` reshaped to ``shape``; writes
    through it are visible in ``shared_array``.
    '''
    return np.ctypeslib.as_array(shared_array).reshape(shape)
def f_sum(args):
    """Do ``num_sum`` additions of busy work and return the last data element.

    ``args`` is an ``(x, y)`` tuple used only for the printed message.
    ``data`` is the module-level global installed by ``init_pool``. The loop
    total minus its closed form is zero, so the result equals ``data[-1]``
    (as a float).
    """
    (x, y) = args
    print('This is process', x, 'random number:', y, 'last data', data[-1])
    value = 0
    for i in range(num_sum):
        value += i
    return value - num_sum*(num_sum-1)/2 + data[-1]
def init_pool(shared_array, shape):
    """Pool initializer: expose the shared array as the numpy global ``data``.

    ``shared_array`` is wrapped (not copied) by ``to_numpy_array`` and
    reshaped to ``shape``.
    """
    global data
    data = to_numpy_array(shared_array, shape)
def multiprocessing(func, args, workers, input):
    """Generate random input, share it with a pool and map ``func`` over ``args``.

    NOTE: the ``input`` argument is ignored. A fresh random array of
    ``data_size`` integers is always generated here, so that it is created
    once in the calling process only. Returns ``(input, results)``: the
    generated array and the list of results in the order of ``args``.
    """
    input = np.random.randint(0, 100, size=data_size)
    shape = input.shape
    shared_array = to_shared_array(input, ctypes.c_long)
    with ProcessPoolExecutor(max_workers=workers, initializer=init_pool, initargs=(shared_array, shape)) as executor:
        results = list(executor.map(func, args))
    return input, results
if __name__ == '__main__':
    t0 = time.time()
    args = [(k, np.random.randint(1,10)) for k in range(num_iterations)]
    # The last argument is ignored by multiprocessing(), which generates its
    # own input; pass None instead of the builtin input() function that the
    # bare name would otherwise resolve to.
    generated, result = multiprocessing(f_sum, args, num_workers, None)
    print(f'expected result: {generated[-1]}, actual result:{np.unique(result)}')
    t1 = time.time()
    print(f'total time: {t1-t0}')

How to retrieve values from a function run in parallel processes?

The multiprocessing module is quite confusing for Python beginners, especially for those who have just migrated from MATLAB and have been made lazy by its Parallel Computing Toolbox. I have the following code, which takes ~80 seconds to run, and I want to shorten this time by using the multiprocessing module of Python.
from time import time
# Serial reference: scan x in [0, xmax) and report every x whose y value is
# at most 16 (0xf + 1), then print the timing.
xmax = 100000000
start = time()
for x in range(xmax):
    y = ((x+5)**2+x-40)
    if y <= 0xf+1:
        print('Condition met at: ', y, x)
end = time()
tt = end-start  # total time
print('Each iteration took: ', tt/xmax)
print('Total time: ', tt)
This outputs as expected:
Condition met at: -15 0
Condition met at: -3 1
Condition met at: 11 2
Each iteration took: 8.667453265190124e-07
Total time: 86.67453265190125
As any iteration of the loop is not dependent on others, I tried to adopt this Server Process from the official documentation to scan chunks of the range in separate processes. And finally I came up with vartec's answer to this question and could prepare the following code. I also updated the code based on Darkonaut's response to the current question.
from time import time
import multiprocessing as mp
def chunker (rng, t): # this function makes t chunks out of rng
    """Split the half-open range ``[rng[0], rng[1])`` into ``t`` contiguous chunks.

    Returns a list of ``[start, stop]`` pairs meant to be consumed as
    ``range(start, stop)``. Consecutive chunks share their boundary
    (``stop`` of one is ``start`` of the next), so no value is skipped, and
    chunk sizes differ by at most one.

    Raises:
        ValueError: if ``t`` is not positive.
    """
    if t <= 0:
        raise ValueError('t must be a positive number of chunks')
    base, extra = divmod(rng[1] - rng[0], t)
    chunks = []
    lo = rng[0]
    for i in range(t):
        # The first `extra` chunks take one additional element each.
        hi = lo + base + (1 if i < extra else 0)
        chunks.append([lo, hi])
        lo = hi
    return chunks
def worker(lock, xrange, return_dict):
    '''Scan ``range(xrange[0], xrange[1])`` and record every hit exactly once.

    A hit is an x whose ``y = (x+5)**2 + x - 40`` is at most 16. Hits are
    appended to ``return_dict['x']`` / ``return_dict['y']`` while holding
    ``lock``. The lists are read, modified and assigned back so that the
    change also propagates when the dict is a manager proxy holding plain
    lists.
    '''
    for x in range(xrange[0], xrange[1]):
        y = ((x+5)**2+x-40)
        if y <= 0xf+1:
            print('Condition met at: ', y, x)
            # Single, locked update: appending once outside the lock and
            # again inside it would record each hit twice.
            with lock:
                list_x = return_dict['x']
                list_y = return_dict['y']
                list_x.append(x)
                list_y.append(y)
                return_dict['x'] = list_x
                return_dict['y'] = list_y
if __name__ == '__main__':
    start = time()
    manager = mp.Manager()
    return_dict = manager.dict()
    lock = manager.Lock()
    return_dict['x'] = manager.list()
    return_dict['y'] = manager.list()
    xmax = 100000000
    nw = mp.cpu_count()
    # One chunk of the range, and one process, per CPU.
    chunks = chunker([0, xmax], nw)
    jobs = []
    for chunk in chunks:
        p = mp.Process(target=worker, args=(lock, chunk, return_dict))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()
    end = time()
    tt = end-start  # total time
    print('Each iteration took: ', tt/xmax)
    print('Total time: ', tt)
    # Copy into plain lists so the contents are printed.
    print(list(return_dict['x']))
    print(list(return_dict['y']))
which considerably reduces the run time to ~17 Secs. But, my shared variable cannot retrieve any values. Please help me find out which part of the code is going wrong.
the output I get is:
Each iteration took: 1.7742713451385497e-07
Total time: 17.742713451385498
[]
[]
from which I expect:
Each iteration took: 1.7742713451385497e-07
Total time: 17.742713451385498
[0, 1, 2]
[-15, -3, 11]
The issue in your example is that modifications to standard mutable structures within Manager.dict will not be propagated. I'm first showing you how to fix it with manager, just to show you better options afterwards.
multiprocessing.Manager is a bit heavy since it uses a separate Process just for the Manager and working on a shared object needs using locks for data consistency. If you run this on one machine, there are better options with multiprocessing.Pool, in case you don't have to run customized Process classes and if you have to, multiprocessing.Process together with multiprocessing.Queue would be the common way of doing it.
The quoting parts are from the multiprocessing docs.
Manager
If standard (non-proxy) list or dict objects are contained in a referent, modifications to those mutable values will not be propagated through the manager because the proxy has no way of knowing when the values contained within are modified. However, storing a value in a container proxy (which triggers a setitem on the proxy object) does propagate through the manager and so to effectively modify such an item, one could re-assign the modified value to the container proxy...
In your case this would look like:
def worker(xrange, return_dict, lock):
    """Scan ``range(xrange[0], xrange[1])`` and record hits in ``return_dict``.

    A hit is an x whose ``y = (x+5)**2 + x - 40`` is at most 16. The lists
    stored under ``'x'`` and ``'y'`` are read, extended and assigned back
    while holding ``lock``; the re-assignment is what makes the change
    propagate through a manager dict.
    """
    for x in range(xrange[0], xrange[1]):
        y = ((x+5)**2+x-40)
        if y <= 0xf+1:
            print('Condition met at: ', y, x)
            with lock:
                list_x = return_dict['x']
                list_y = return_dict['y']
                list_x.append(x)
                list_y.append(y)
                return_dict['x'] = list_x
                return_dict['y'] = list_y
The lock here would be a manager.Lock instance you have to pass along as argument since the whole (now) locked operation is not by itself atomic. (Here
is an easier example with Manager using Lock)
This approach is perhaps less convenient than employing nested Proxy Objects for most use cases but also demonstrates a level of control over the synchronization.
Since Python 3.6 proxy objects are nestable:
Changed in version 3.6: Shared objects are capable of being nested. For example, a shared container object such as a shared list can contain other shared objects which will all be managed and synchronized by the SyncManager.
Since Python 3.6 you can fill your manager.dict before starting multiprocessing with manager.list as values and then append directly in the worker without having to reassign.
return_dict['x'] = manager.list()
return_dict['y'] = manager.list()
EDIT:
Here is the full example with Manager:
import time
import multiprocessing as mp
from multiprocessing import Manager, Process
from contextlib import contextmanager
# mp_util.py from first link in code-snippet for "Pool"
# section below
from mp_utils import calc_batch_sizes, build_batch_ranges
# def context_timer ... see code snippet in "Pool" section below
def worker(batch_range, return_dict, lock):
    """Scan ``batch_range`` and append every hit to the lists in ``return_dict``.

    A hit is an x whose ``y = (x+5)**2 + x - 40`` is at most 16. The values
    under ``'x'`` and ``'y'`` are appended to in place while holding
    ``lock``, so they must be containers whose in-place changes are shared
    (for example nested manager lists).
    """
    for x in batch_range:
        y = ((x+5)**2+x-40)
        if y <= 0xf+1:
            print('Condition met at: ', y, x)
            with lock:
                return_dict['x'].append(x)
                return_dict['y'].append(y)
if __name__ == '__main__':
    N_WORKERS = mp.cpu_count()
    X_MAX = 100000000
    batch_sizes = calc_batch_sizes(X_MAX, n_workers=N_WORKERS)
    batch_ranges = build_batch_ranges(batch_sizes)
    print(batch_ranges)
    with Manager() as manager:
        lock = manager.Lock()
        return_dict = manager.dict()
        return_dict['x'] = manager.list()
        return_dict['y'] = manager.list()
        tasks = [(batch_range, return_dict, lock)
                 for batch_range in batch_ranges]
        with context_timer():
            pool = [Process(target=worker, args=args)
                    for args in tasks]
            for p in pool:
                p.start()
            for p in pool:
                p.join()
        # Create standard container with data from manager before exiting
        # the manager.
        result = {k: list(v) for k, v in return_dict.items()}
    print(result)
Pool
Most often a multiprocessing.Pool will just do it. You have an additional challenge in your example since you want to distribute iteration over a range.
Your chunker function doesn't manage to divide the range evenly so that every process has about the same amount of work to do:
chunker((0, 21), 4)
# Out: [[0, 4], [5, 9], [10, 14], [15, 21]] # 4, 4, 4, 6!
For the code below please grab the code snippet for mp_utils.py from my answer here, it provides two functions to chunk ranges as even as possible.
With multiprocessing.Pool your worker function just has to return the result and Pool will take care of transporting the result back over internal queues to the parent process. The result will be a list, so you will have to rearrange your result again in the way you want to have it. Your example could then look like this:
import time
import multiprocessing as mp
from multiprocessing import Pool
from contextlib import contextmanager
from itertools import chain
from mp_utils import calc_batch_sizes, build_batch_ranges
#contextmanager
def context_timer():
start_time = time.perf_counter()
yield
end_time = time.perf_counter()
total_time = end_time-start_time
print(f'\nEach iteration took: {total_time / X_MAX:.4f} s')
print(f'Total time: {total_time:.4f} s\n')
def worker(batch_range):
    """Return the ``(x, y)`` hits found in ``batch_range``.

    A hit is an x whose ``y = (x+5)**2 + x - 40`` is at most 16; each hit is
    also printed.
    """
    result = []
    for x in batch_range:
        y = ((x+5)**2+x-40)
        if y <= 0xf+1:
            print('Condition met at: ', y, x)
            result.append((x, y))
    return result
if __name__ == '__main__':
    N_WORKERS = mp.cpu_count()
    X_MAX = 100000000
    batch_sizes = calc_batch_sizes(X_MAX, n_workers=N_WORKERS)
    batch_ranges = build_batch_ranges(batch_sizes)
    print(batch_ranges)
    with context_timer():
        with Pool(N_WORKERS) as pool:
            results = pool.map(worker, iterable=batch_ranges)
    print(f'results: {results}')
    # Flatten the per-batch lists, then split the (x, y) pairs into two
    # tuples. Unpacking zip() of nothing would fail, so handle "no hits".
    hits = list(chain.from_iterable(results))
    x, y = zip(*hits) if hits else ((), ())
    print(f'results sorted: x: {x}, y: {y}')
Example Output:
[range(0, 12500000), range(12500000, 25000000), range(25000000, 37500000),
range(37500000, 50000000), range(50000000, 62500000), range(62500000, 75000000), range(75000000, 87500000), range(87500000, 100000000)]
Condition met at: -15 0
Condition met at: -3 1
Condition met at: 11 2
Each iteration took: 0.0000 s
Total time: 8.2408 s
results: [[(0, -15), (1, -3), (2, 11)], [], [], [], [], [], [], []]
results sorted: x: (0, 1, 2), y: (-15, -3, 11)
Process finished with exit code 0
If you had multiple arguments for your worker you would build a "tasks"-list with argument-tuples and exchange pool.map(...) with pool.starmap(...iterable=tasks). See docs for further details on that.
Process & Queue
If you can't use multiprocessing.Pool for some reason, you have to take
care of inter-process communication (IPC) yourself, by passing a
multiprocessing.Queue as argument to your worker-functions in the child-
processes and letting them enqueue their results to be sent back to the
parent.
You will also have to build your Pool-like structure so you can iterate over it to start and join the processes and you have to get() the results back from the queue. More about Queue.get usage I've written up here.
A solution with this approach could look like this:
def worker(result_queue, batch_range):
    """Collect the ``(x, y)`` hits in ``batch_range`` and put them on ``result_queue``.

    A hit is an x whose ``y = (x+5)**2 + x - 40`` is at most 16. Exactly one
    list (possibly empty) is enqueued per call.
    """
    result = []
    for x in batch_range:
        y = ((x+5)**2+x-40)
        if y <= 0xf+1:
            print('Condition met at: ', y, x)
            result.append((x, y))
    result_queue.put(result)  # <--
if __name__ == '__main__':
    N_WORKERS = mp.cpu_count()
    X_MAX = 100000000
    result_queue = mp.Queue()  # <--
    batch_sizes = calc_batch_sizes(X_MAX, n_workers=N_WORKERS)
    batch_ranges = build_batch_ranges(batch_sizes)
    print(batch_ranges)
    with context_timer():
        pool = [Process(target=worker, args=(result_queue, batch_range))
                for batch_range in batch_ranges]
        for p in pool:
            p.start()
        # Drain the queue (one list per worker) before joining.
        results = [result_queue.get() for _ in batch_ranges]
        for p in pool:
            p.join()
    print(f'results: {results}')
    # Flatten the per-batch lists, then split the (x, y) pairs into two
    # tuples. Unpacking zip() of nothing would fail, so handle "no hits".
    hits = list(chain.from_iterable(results))
    x, y = zip(*hits) if hits else ((), ())
    print(f'results sorted: x: {x}, y: {y}')

Parallelization/multiprocessing of conditional for loop

I want to use multiprocessing in Python to speed up a while loop.
More specifically:
I have a matrix (samples*features). I want to select x subsets of samples whose values at a random subset of features is unequal to a certain value (-1 in this case).
My serial code:
# Serial search: for each tree draw a random feature subset, then walk the
# samples in random order and keep those with no -1 in the chosen features,
# until no_samp samples are found.
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
no_feat = 500
no_samp = 5
no_trees = 5
i = 0
attempts = 0  # feature subsets tried so far
samples = np.zeros((no_trees, no_samp))
features = np.zeros((no_trees, no_feat))
while i < no_trees:
    rand_feat = np.random.choice(df.shape[1], no_feat, replace=False)
    iter_order = np.random.choice(df.shape[0], df.shape[0], replace=False)
    samp_idx = []
    a = 0  # samples inspected before the subset was complete
    #--------------
    #how to run in parallel?
    for j in iter_order:
        pot_samp = df.iloc[j, rand_feat]
        if len(np.where(pot_samp==-1)[0]) == 0:
            samp_idx.append(j)
        if len(samp_idx) == no_samp:
            # Single-argument print() behaves the same on Python 2 and 3.
            print(a)
            break
        a += 1
    #--------------
    if len(samp_idx) == no_samp:
        samples[i,:] = samp_idx
        features[i, :] = rand_feat
        i += 1
    attempts += 1
    if attempts > 1000:  #break if subsets cannot be found
        break
Searching for fitting samples is the potentially expensive part (the j for loop), which in theory can be run in parallel. In some cases, it is not necessary to iterate over all samples to find a large enough subset, which is why I am breaking out of the loop as soon as the subset is large enough.
I am struggling to find an implementation that would allow for checks of how many valid results are generated already. Is it even possible?
I have used joblib before. If I understand correctly this uses the pool methods of multiprocessing as a backend which only works for separate tasks? I am thinking that queues might be helpful but thus far I failed at implementing them.
I found a working solution. I decided to run the while loop in parallel and have the different processes interact over a shared counter. Furthermore, I vectorized the search for suitable samples.
The vectorization yielded a ~300x speedup and running on 4 cores speeds up the computation ~twofold.
First I tried to implement separate processes and put the results into a queue. Turns out these aren't made to store large amounts of data.
If someone sees another bottleneck in that code I would be glad if someone pointed it out.
With my basically nonexistent knowledge about parallel computing I found it really hard to puzzle this together, especially since the examples on the internet are all very basic. I learnt a lot though =)
My code:
import numpy as np
import pandas as pd
import itertools
from multiprocessing import Pool, Lock, Value
from datetime import datetime
import settings
# Shared state used by findSamp. The same objects are handed to each pool
# process through initialize().
val = Value('i', 0)  # number of sample subsets found so far
worker_ID = Value('i', 1)  # next worker number to hand out
lock = Lock()  # guards updates of the two counters above
def findSamp(no_trees, df, no_feat, no_samp):
    """Collect sample/feature subsets until ``no_trees`` have been found overall.

    Repeatedly draws ``no_feat`` random columns of ``df`` and looks for rows
    that contain no -1 in those columns. When at least ``no_samp`` such rows
    exist, ``no_samp`` of them are drawn at random and stored together with
    the feature subset. The shared counter ``val`` counts the subsets found
    by all workers. The loop ends when it reaches ``no_trees`` or after
    ``max_iter`` unsuccessful feature subsets in this worker.

    Returns:
        (samp, feat): two lists of equal length holding the row-index arrays
        and the matching column-index arrays found by this worker.
    """
    # Read the ID before incrementing so the printed and the stored worker
    # number agree.
    with lock:
        worker_ID_local = worker_ID.value
        print('starting worker - {0}'.format(worker_ID_local))
        worker_ID.value += 1
    max_iter = 100000
    samp = []
    feat = []
    iter_outer = 0
    all_idx = np.arange(df.shape[0])
    while val.value < no_trees and iter_outer < max_iter:
        rand_feat = np.random.choice(df.shape[1], no_feat, replace=False)
        #get samples with random features from dataset;
        #find and select samples that don't have missing values in the random features
        samp_rand = df.iloc[:, rand_feat]
        nan_idx = np.unique(np.where(np.asarray(samp_rand == -1))[0])
        notnan_idx = np.setdiff1d(all_idx, nan_idx)
        if notnan_idx.shape[0] >= no_samp:
            #if enough samples for random feature subset, select no_samp samples randomly
            notnan_idx_rand = np.random.choice(notnan_idx, no_samp, replace=False)
            with lock:
                # Re-check under the lock: another worker may have supplied
                # the last missing subset since the loop condition was tested.
                if val.value >= no_trees:
                    break
                val.value += 1
            samp.append(notnan_idx_rand)
            feat.append(rand_feat)
        else:
            #increase iter_outer counter if no sample subset could be found for random feature subset
            iter_outer += 1
    # Report against the counter that actually bounds the loop.
    if iter_outer >= max_iter:
        print('exiting worker{0} because iter >= max_iter'.format(worker_ID_local))
    else:
        print('worker{0} - finished'.format(worker_ID_local))
    return samp, feat
def initialize(*args):
    """Pool initializer: install the shared counter, worker ID and lock as globals.

    Expects exactly three positional arguments, in the order
    ``(val, worker_ID, lock)``.
    """
    global val, worker_ID, lock
    val, worker_ID, lock = args
def star_findSamp(i_df_no_feat_no_samp):
    """Unpack one argument tuple and call ``findSamp`` with it.

    Lets ``Pool.map``, which passes a single argument, drive the
    four-argument ``findSamp``.
    """
    return findSamp(*i_df_no_feat_no_samp)
if __name__ == '__main__':
    # NOTE(review): the RNG is seeded once here, before the pool is created.
    # If workers inherit this state they may all draw the same random
    # sequence -- confirm, and reseed per worker if so.
    np.random.seed(43)
    datafile = '...'
    df = pd.read_csv(datafile, sep=" ", nrows = 89)
    df = df.fillna(-1)
    df = df.iloc[:, 6:]
    no_feat = 700
    no_samp = 10
    no_trees = 5000
    startTime = datetime.now()
    print('starting multiprocessing')
    ncores = 4
    p = Pool(ncores, initializer=initialize, initargs=(val, worker_ID, lock))
    # One identical argument tuple per worker; a plain list works on both
    # Python 2 and 3 (itertools.izip exists only on Python 2).
    args = [(no_trees, df, no_feat, no_samp)] * ncores
    result = p.map(star_findSamp, args)
    p.close()
    p.join()
    print('{0} sample subsets for tree training have been found'.format(val.value))
    # Skip workers that found nothing; an empty list cannot be stacked with
    # the others.
    found = [x for x in result if x is not None and len(x[0]) > 0]
    samples = np.vstack([x[0] for x in found])
    features = np.vstack([x[1] for x in found])
    print(datetime.now() - startTime)

Categories