Related
from concurrent.futures import ThreadPoolExecutor, as_completed
def add_one(number, n):
return number + 1 + n
def process():
all_numbers = []
for i in range(0, 10):
all_numbers.append(i)
threads = []
all_results = []
with ThreadPoolExecutor(max_workers=10) as executor:
for number in all_numbers:
threads.append(executor.submit(add_one, number))
for index, task in enumerate(as_completed(threads)):
result = task.result()
#print(result)
all_results.append(result)
for index, result in enumerate(all_results):
print(result)
process()
If I set max_works=1, it will print out from 1 to 10 in order; if I set max_workers = 10, the order could be random:
5
3
10
7
1
8
6
2
4
9
How to keep the original order of the input when using ThreadPoolExecutor to process a list of items as in this example?
You can use the map method of the ThreadExecutor:
from concurrent.futures import ThreadPoolExecutor, as_completed
def add_one(number):
return number + 1
def process():
all_numbers = []
for i in range(0, 10):
all_numbers.append(i)
all_results = []
with ThreadPoolExecutor(max_workers=10) as executor:
for i in executor.map(add_one, all_numbers):
print(i)
all_results.append(i)
for index, result in enumerate(all_results):
print(result)
process()
Updated answer based on comments requirements:
from concurrent.futures import ThreadPoolExecutor, as_completed
def add_one(args):
return args[0] + 1 + args[1]
def process():
all_numbers = []
for i in range(0, 10):
all_numbers.append([i, 2])
all_results = []
with ThreadPoolExecutor(max_workers=10) as executor:
for i in executor.map(add_one, all_numbers):
print(i)
all_results.append(i)
for index, result in enumerate(all_results):
print(result)
process()
This mixes two incompatible ideas!
When you use a thread/process/whatever pool, the work will be done in an arbitrary order (largely the result of unrelated system load). Some work may happen at the same time as other work (which is normally the benefit of such a system; parallelization). However, unless you go out of your way to order the results, they will be in whatever order the pool did the work in.
Rather than attempting to "order" the results, consider mapping them back to some collection, like a dictionary, so you can read them back by-key (which may have some order to it).
Just remove as_completed from your code!
executor.submit will return Future object, and you append it to list in order.
But as_completed will generate Future objects in completed order.
from concurrent.futures import ThreadPoolExecutor, as_completed
def add_one(number, n):
return number + 1 + n
def process():
all_numbers = []
for i in range(0, 10):
all_numbers.append(i)
threads = []
all_results = []
with ThreadPoolExecutor(max_workers=10) as executor:
for number in all_numbers:
threads.append(executor.submit(add_one, number))
for index, task in enumerate(threads):
result = task.result()
#print(result)
all_results.append(result)
for index, result in enumerate(all_results):
print(result)
process()
Per #marlon's request, here is a variation of the #rorra solution that uses itertools.repeat to reduce some of the parameter passing complexity.
Example:
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import itertools
def add_one(number, n):
return number + 1 + n
def process():
all_numbers = list(range(0, 10))
with ThreadPoolExecutor(max_workers=10) as executor:
for result in executor.map(add_one, all_numbers, itertools.repeat(2)):
print(result)
process()
Output:
3
4
5
6
7
8
9
10
11
12
This can be one of the ways to get the desired result
from concurrent.futures import ThreadPoolExecutor, as_completed
def add_one(number, index):
return number + 1, index
def process():
all_numbers = []
for i in range(0, 10):
all_numbers.append(i)
threads = []
all_results = []
with ThreadPoolExecutor(max_workers=10) as executor:
for index, number in enumerate(all_numbers):
threads.append(executor.submit(add_one, number, index))
for task in as_completed(threads):
result, index = task.result()
all_results.append([result, index])
all_results = sorted(all_results, key=lambda x: x[-1])
for index, result in enumerate(all_results):
print(result[0])
process()
I know that this question was posted a some time ago, but I had a similar problem and I could sovle it creating a dict using the result of the method id() as key, and the chunk processed for the future function (in my case if one fails I need to try process this chunk one more time). Ex:
control_dict = {}
future = executor.submit(my_func, chunk))
control_dict[id(future)] = chunk
You could use the same strategy to "find" the order.
I have the following problem. The code below successfully linear fits may data from 50 to 400 samples (I never have more than 400 samples and the first 50 are of horrendous quality). In the third dimension I will have the value of 7 and the fourth dimension can have values of up to 10000 therefore this loop "solution" would take alot of time. How can I not use a for loop and decrease my runtimes? Thank you for your help (I am pretty new to Python)
from sklearn.linear_model import TheilSenRegressor
import numpy as np
#ransac = linear_model.RANSACRegressor()
skip_v=50#number of values to be skipped
N=400
test_n=np.reshape(range(skip_v, N),(-1,1))
f_n=7
d4=np.shape(data)
a6=np.ones((f_n,d4[3]))
b6=np.ones((f_n,d4[3]))
for j in np.arange(d4[3]):
for i in np.arange(f_n):
theil = TheilSenRegressor(random_state=0).fit(test_n,np.log(data[skip_v:,3,i,j]))
a6[i,j]=theil.coef_
b6[i,j]=theil.intercept_
You can use multiprocessing to work your loop in parallel. The following code is not working. It just demonstrates how to do it. It is only useful, if your numbers are really big. Otherwise, doing in sequential is faster.
from sklearn.linear_model import TheilSenRegressor
import numpy as np
import multiprocessing as mp
from itertools import product
def worker_function(input_queue, output_queue, skip_v, test_n, data):
for task in iter(input_queue.get, 'STOP'):
i = task[0]
j = task[1]
theil = TheilSenRegressor(random_state=0).fit(test_n,np.log(data[skip_v:,3,i,j]))
output_queue.put([i, j, theil])
if __name__ == "__main__":
# define data here
f_n = 7
d4 = np.shape(data)
skip_v = 50
N=400
test_n=np.reshape(range(skip_v, N),(-1,1))
input_queue = mp.Queue()
output_queue = mp.Queue()
# here you create all combinations of j and i of your loop
list1 = range(f_n)
list2 = range(d4[3])
list3 = [list1, list2]
tasks = [p for p in product(*list3)]
numProc = 4
# start processes
process = [mp.Process(target=worker_function,
args=(input_queue, output_queue,
skip_v, test_n, data)) for x in range(numProc)]
for p in process:
p.start()
# queue tasks
for i in tasks:
input_queue.put(i)
# signal workers to stop after tasks are all done
for i in range(numProc):
input_queue.put('STOP')
# get the results
for i in range(len(tasks)):
res = output_queue.get(block=True) # wait for results
a6[res[0], res[1]] = res[2].coef_
b6[res[0], res[1]] = res[2].intercept_
I am facing an issue I was not able to solve by doing some search on the web.
I am using the minimal code below. The goal is to run some function 'f_sum' several million times by multiprocessing (using the ProcessPoolExecutor). I am adding multiple arguments by a list of tuples 'args'. In addition, the function is supposed to use some sort of data which is the same for all executions (in the example it's just one number). I do not want to add the data to the 'args' tuple for memory reasons.
The only option I found so far is adding the data outside of the "if name == 'main'". This will (for some reason that I do not understand) make the variable available to all processes. However, updating is not possible. Also, I do not really want to make the data definition outside because in the actual code it will be based on data import and might require additional manipulation.
Hope you can help and thanks in advance!
PS: I am using Python 3.7.9 on Win 10.
from concurrent.futures import ProcessPoolExecutor
import numpy as np
data = 0 # supposed to be a large data set & shared among all calculations)
num_workers = 6 # number of CPU cores
num_iterations = 10 # supposed to be large number
def f_sum(args):
(x,y) = args
print('This is process', x, 'with exponent:', y)
value = 0
for i in range(10**y):
value += i
return value/10**y + data
def multiprocessing(func, args, workers):
with ProcessPoolExecutor(workers) as executor:
results = executor.map(func, args)
return list(results)
if __name__ == '__main__':
data = 0.5 # try to update data, should not be part of 'args' due to memory
args = []
for k in range(num_iterations):
args.append((k, np.random.randint(1,8)))
result = multiprocessing(f_sum, args, num_workers)
if np.abs(result[0]-np.round(result[0])) > 0:
print('data NOT updated')
Edit to original question:
>> Performance Example 1
from concurrent.futures import ProcessPoolExecutor
import numpy as np
import time
data_size = 10**8
num_workers = 4
num_sum = 10**7
num_iterations = 100
data = np.random.randint(0,100,size=data_size)
# data = np.linspace(0,data_size,data_size+1, dtype=np.uintc)
def f_sum(args):
(x,y) = args
print('This is process', x, 'random number:', y, 'last data', data[-1])
value = 0
for i in range(num_sum):
value += i
result = value - num_sum*(num_sum-1)/2 + data[-1]
return result
def multiprocessing(func, args, workers):
with ProcessPoolExecutor(workers) as executor:
results = executor.map(func, args)
return list(results)
if __name__ == '__main__':
t0 = time.time()
args = []
for k in range(num_iterations):
args.append((k, np.random.randint(1,10)))
result = multiprocessing(f_sum, args, num_workers)
print(f'expected result: {data[-1]}, actual result: {np.unique(result)}')
t1 = time.time()
print(f'total time: {t1-t0}')
>> Output
This is process 99 random number: 6 last data 9
expected result: 86, actual result: [ 3. 9. 29. 58.]
total time: 11.760863542556763
Leads to false result if randint is used. For linspace result is correct.
>> Performance Example 2 - based on proposal in answer
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from multiprocessing import Array
import time
data_size = 10**8
num_workers = 4
num_sum = 10**7
num_iterations = 100
input = np.random.randint(0, 100, size=data_size)
# input = np.linspace(0, data_size, data_size + 1, dtype=np.uintc)
def f_sum(args):
(x,y) = args
print('This is process', x, 'random number:', y, 'last data', data[-1])
value = 0
for i in range(num_sum):
value += i
result = value - num_sum*(num_sum-1)/2 + data[-1]
return result
def init_pool(the_data):
global data
data = the_data
def multiprocessing(func, args, workers, input):
data = Array('i', input, lock=False)
with ProcessPoolExecutor(max_workers=workers, initializer=init_pool, initargs=(data,)) as executor:
results = list(executor.map(func, args))
return results
if __name__ == '__main__':
t0 = time.time()
args = []
for k in range(num_iterations):
args.append((k, np.random.randint(1,10)))
result = multiprocessing(f_sum, args, num_workers, input)
print(f'expected result: {input[-1]}, actual result:{np.unique(result)}')
t1 = time.time()
print(f'total time: {t1-t0}')
>> Output
This is process 99 random number: 7 last data 29
expected result: 29, actual result: [29.]
total time: 30.8266122341156
#Booboo
I added two examples to my original question, the "Performance Example 2" is based on your code. First interesting finding, my original code actually gives incorrect results if the data array is initialized with random integers. I noticed, that each process by itself initializes the data array. Since it is based on random numbers each process uses a different array for calculation, and even different than the main. So that use case would not work with this code, in your code it is correct all the time.
If using linspace, however, it works, since this gives the same result each time. Same would be true for the use case where some data is read from a file (which is my actual use case). Example 1 is still about 3x faster than Example 2, and I think the time is mainly used by the initializing of the array in your method.
Regarding memory usage I don't see a relevant difference in my task manager. Both Example produce a similar increase in memory, even if the shape is different.
I still believe that your method is the correct approach, however, memory usage seems to be similar and speed is slower in the example above.
The most efficient used of memory would be to use shared memory so that all processes are working on the same instance of data. This would be absolutely necessary if the processes updated data. In the example below, since the access to data is read only and I am using a simple array of integers, I am using multiprocessing.Array with no locking specified. The "trick" is to initialize your pool by specifying the initializer and initargs arguments so that each process in the pool has access to this shared memory. I have made a couple of other changes to the code, which I have commented
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from multiprocessing import Array, cpu_count # new imports
def init_pool(the_data):
global data
data = the_data
def f_sum(args):
(x,y) = args
print('This is process', x, 'with exponent:', y)
value = 0
for i in range(10**y):
value += i
return value/10**y + len(data) # just use the length of data for now
def multiprocessing(func, args, workers):
data = Array('i', range(1000), lock=False) # read-only, integers 0, 1, 2, ... 999
with ProcessPoolExecutor(max_workers=workers, initializer=init_pool, initargs=(data,)) as executor:
results = list(executor.map(func, args)) # create the list of results here
print(results) # so that it can be printed out for demo purposes
return results
if __name__ == '__main__':
num_iterations = 10 # supposed to be large number
#num_workers = 6 # number of CPU cores
num_workers = cpu_count() # number of CPU cores
args = []
for k in range(num_iterations):
args.append((k, np.random.randint(1,8)))
result = multiprocessing(f_sum, args, num_workers)
if np.abs(result[0]-np.round(result[0])) > 0:
print('data NOT updated')
Prints:
This is process 0 with exponent: 2
This is process 1 with exponent: 1
This is process 2 with exponent: 4
This is process 3 with exponent: 3
This is process 4 with exponent: 5
This is process 5 with exponent: 1
This is process 6 with exponent: 5
This is process 7 with exponent: 2
This is process 8 with exponent: 6
This is process 9 with exponent: 6
[1049.5, 1004.5, 5999.5, 1499.5, 50999.5, 1004.5, 50999.5, 1049.5, 500999.5, 500999.5]
data NOT updated
Updated Example 2
You saw my comments to your question concerning Example 1.
Your Example 2 is still not ideal: You have the statement input = np.random.randint(0, 100, size=data_size) as a global being needlessly executed by every process as it is initialized for use in the process pool. Below is an updated solution that also shows one way how you can have your worker function work directly with a numpy array that is backed up a multiprocessing.Array instance so that the numpy array exists in shared memory. You don't have to use this technique for what you are doing since you are only using numpy to create random numbers (I an not sure why), but it is a useful technique to know. But you should re-rerun your code after moving the initialization code of input as I have so it is only executed once.
I don't have the occasion to work with numpy day to day but I have come to learn that it uses multiprocessing internally for many of its own functions. So it is often not the best match for use with multiprocessing, although that does not seem to be applicable here since even in the case below we are just indexing an element of an array and it would not be using a sub-process to accomplish that.
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from multiprocessing import Array
import time
import ctypes
data_size = 10**8
num_workers = 4
num_sum = 10**7
num_iterations = 100
# input = np.linspace(0, data_size, data_size + 1, dtype=np.uintc)
def to_shared_array(arr, ctype):
shared_array = Array(ctype, arr.size, lock=False)
temp = np.frombuffer(shared_array, dtype=arr.dtype)
temp[:] = arr.flatten(order='C')
return shared_array
def to_numpy_array(shared_array, shape):
'''Create a numpy array backed by a shared memory Array.'''
arr = np.ctypeslib.as_array(shared_array)
return arr.reshape(shape)
def f_sum(args):
(x,y) = args
print('This is process', x, 'random number:', y, 'last data', data[-1])
value = 0
for i in range(num_sum):
value += i
result = value - num_sum*(num_sum-1)/2 + data[-1]
return result
def init_pool(shared_array, shape):
global data
data = to_numpy_array(shared_array, shape)
def multiprocessing(func, args, workers, input):
input = np.random.randint(0, 100, size=data_size)
shape = input.shape
shared_array = to_shared_array(input, ctypes.c_long)
with ProcessPoolExecutor(max_workers=workers, initializer=init_pool, initargs=(shared_array, shape)) as executor:
results = list(executor.map(func, args))
return input, results
if __name__ == '__main__':
t0 = time.time()
args = []
for k in range(num_iterations):
args.append((k, np.random.randint(1,10)))
input, result = multiprocessing(f_sum, args, num_workers, input)
print(f'expected result: {input[-1]}, actual result:{np.unique(result)}')
t1 = time.time()
print(f'total time: {t1-t0}')
I read a lot of posts about parallelization using the multiprocessing module but none of them quite answered my question.
I have a very long generator giving me parameter values and for each I want to compute some function value. However, I only want to save the best n many, since I am only interested in the best ones and saving all of the results would blow up the RAM.
There way I see it, there are two ways to do this: 1) use a common shared memory between the processes where the best values are saved or 2) keep separate lists of the best results for each core/process and later manually merge these lists together.
I think the second method would be better, however I am not sure how to implement this.
This is what I got so far:
import numpy as np
import multiprocessing
from functools import partial
def get_generator(length: int):
for i in range(length):
yield [i, i + 1]
def some_func(x, other_stuff):
y = np.sum(x)
return y
def task(other_stuff, x: np.ndarray):
val = some_func(x, other_stuff)
if val > task.some_dict['min']:
task.l.append(val)
task.some_dict['min'] = val
return
def task_init(l, some_dict):
task.l = l
task.some_dict = some_dict
task.some_dict['min'] = np.NINF
n = 20
generator = get_generator(n)
other_stuff = np.nan
func = partial(task, other_stuff)
l = multiprocessing.Manager().list()
some_dict = multiprocessing.Manager().dict()
p = multiprocessing.Pool(None, task_init, [l, some_dict])
p.imap(func, generator, chunksize=10000)
p.close()
p.join()
This would be somewhat similar to what I want to do. But I really care about performance and in the actual code the comparison/saving of the best values will be more complex so I think that the shared memory approach would be really slow.
My question boils down to:
If I have e.g. 8 cores, how could I have 8 lists of the best results each for one core that will be returned, so that the cores work completely independent and rather quick?
Thank you very much!
These are my comments put into action. I hope your actual task is a more complicated computation or it would be hardly worth using multiprocessing.
import numpy as np
import multiprocessing
from functools import partial
from heapq import *
def get_generator(length: int):
for i in range(length):
yield [i, i + 1]
def some_func(x, other_stuff):
y = np.sum(x)
return y
def task(other_stuff, x: np.ndarray):
val = some_func(x, other_stuff)
return val
def main():
n = 20
generator = get_generator(n)
other_stuff = np.nan
func = partial(task, other_stuff)
cpu_count = multiprocessing.cpu_count() - 1 # leave a processor for the main process
chunk_size = n // cpu_count
HEAPSIZE = 8
with multiprocessing.Pool(cpu_count) as pool:
heap = []
for val in pool.imap_unordered(func, generator, chunksize=chunk_size):
if len(heap) < HEAPSIZE:
heappush(heap, val)
elif val > heap[0]:
heappushpop(heap, val)
# sort
values = sorted(heap, reverse=True)
print(values)
if __name__ == '__main__':
main()
Prints:
[39, 37, 35, 33, 31, 29, 27, 25]
Update
I found it best with the following experiment to allocate to the pool a number of processes equal to mp.cpu_count() - 1 to leave the main process a free proceesor to handle the results returned by the workers. I also experimented with the chunksize parameter:
import multiprocessing as mp
import timeit
def worker_process(i):
s = 0
for n in range(10000):
s += i * i # square the argument
s /= 10000
return s
def main():
cpu_count = mp.cpu_count() - 1 # leave a processor for the main process
N = 10000
chunk_size = N // cpu_count # 100 may be good enough
results = []
with mp.Pool(cpu_count) as pool:
for result in pool.imap_unordered(worker_process, range(N), chunksize=chunk_size):
results.append(result)
#print(results[0:10])
if __name__ == '__main__':
print(timeit.timeit(stmt='main()', number=10, globals=globals()) / 10)
On my desktop (running other processes, such as streaming music), the above code did better with assigning mp.cpu_count() - 1 to cpu_count (2.4 seconds vs, 2.5 seconds). Here are other timings (rounded to one decimal place):
chunksize = 1428 -> 2.4 seconds (N // (mp.cpu_count() - 1)
chunksize = 1000 -> 2.7 seconds
chunksize = 100 -> 2.4 seconds
chunksize = 10 -> 2.4 seconds
chunksize = 1 -> 2.6 seconds
The result for a chunksize value of 1000 is a bit of an anomaly. I would suggest trying different values, otherwise N // (mp.cpu_count() - 1). This is assuming you can compute N, the number of items in the iterable. When you have a generator as the iterable, you would have to, in the general case, convert it first to a list, to be able to get its length. Even a chunksize value of 1 in this particular benchmark did not do that much worse. But this is what I have learned from varying the amount of work worker_process has to do:
The more work (i.e. CPU) your worker process has to do to complete its task, the less sensitive it is to the chunksize parameter. If it returns after using very little CPU, then the overhead of transferring the next chunk becomes significant and you want to keep the number of chunk transfers to a small value (i.e. you want a large chunksize value). But if the process is long running, the overhead of transferring the next chunk will not be as impactful.
In the following code the worker process's CPU requirements are trivial:
import multiprocessing as mp
import timeit
def worker_process(i):
return i ** 2
def main():
cpu_count = mp.cpu_count() - 1
N = 100000
chunk_size = N // cpu_count
results = []
with mp.Pool(cpu_count) as pool:
for result in pool.imap_unordered(worker_process, range(N), chunksize=chunk_size):
results.append(result)
print(results[0:10])
if __name__ == '__main__':
print(timeit.timeit(stmt='main()', number=10, globals=globals()) / 10)
The timings:
chunksize = 1428 -> .19 seconds
chunksize = 100 -> .39 seconds
chunksize = 1 -> 11.06 seconds
In the following code the worker process's CPU requirements are more substantial:
import multiprocessing as mp
import timeit
def worker_process(i):
s = 0
for _ in range(1000000):
s += i * i
return s // 1000000
def main():
cpu_count = mp.cpu_count() - 1
N = 1000
chunk_size = N // cpu_count
results = []
with mp.Pool(cpu_count) as pool:
for result in pool.imap_unordered(worker_process, range(N), chunksize=chunk_size):
results.append(result)
print(results[0:10])
if __name__ == '__main__':
print(timeit.timeit(stmt='main()', number=3, globals=globals()) / 3)
The timings:
chunksize = 142 -> 22.6 seconds (N // (mp.cpu_count() - 1)
chunksize = 10 -> 23.5 seconds
chunksize = 1 -> 23.2 seconds
Update 2
According to Python multiprocessing: understanding logic behind chunksize, when methods map, starmap or map_async are called with chunksize=None there is a specific algorithm used to compute a chunksize, which I have used in the code below. I don't know why the default value for methods imap and imap_unordered is 1 and does not use this same algorithm. Perhaps because that wouldn't be "lazy" as implied by the description of these methods. In the following code, which repeats the previous benchmark, I use a redefinition of the same algorithm for computing the default chunksize:
import multiprocessing as mp
import timeit
def worker_process(i):
s = 0
for _ in range(1000000):
s += i * i
return s // 1000000
def compute_chunksize(pool_size, iterable_size):
if iterable_size == 0:
return 0
chunksize, extra = divmod(iterable_size, pool_size * 4)
if extra:
chunksize += 1
return chunksize
def main():
cpu_count = mp.cpu_count() - 1
N = 1000
chunk_size = compute_chunksize(cpu_count, N)
print('chunk_size =', chunk_size)
results = []
with mp.Pool(cpu_count) as pool:
for result in pool.imap_unordered(worker_process, range(N), chunksize=chunk_size):
results.append(result)
print(results[0:10])
if __name__ == '__main__':
print(timeit.timeit(stmt='main()', number=3, globals=globals()) / 3)
Timings:
chunksize 36 -> 22.2 seconds
I want to use multiprocessing in Python to speed up a while loop.
More specifically:
I have a matrix (samples*features). I want to select x subsets of samples whose values at a random subset of features is unequal to a certain value (-1 in this case).
My serial code:
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
no_feat = 500
no_samp = 5
no_trees = 5
i=0
iter=0
samples = np.zeros((no_trees, no_samp))
features = np.zeros((no_trees, no_feat))
while i < no_trees:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False)
iter_order = np.random.choice(df.shape[0], df.shape[0], replace=False)
samp_idx = []
a=0
#--------------
#how to run in parallel?
for j in iter_order:
pot_samp = df.iloc[j, rand_feat]
if len(np.where(pot_samp==-1)[0]) == 0:
samp_idx.append(j)
if len(samp_idx) == no_samp:
print a
break
a+=1
#--------------
if len(samp_idx) == no_samp:
samples[i,:] = samp_idx
features[i, :] = rand_feat
i+=1
iter+=1
if iter>1000: #break if subsets cannot be found
break
Searching for fitting samples is the potentially expensive part (the j for loop), which in theory can be run in parallel. In some cases, it is not necessary to iterate over all samples to find a large enough subset, which is why I am breaking out of the loop as soon as the subset is large enough.
I am struggling to find an implementation that would allow for checks of how many valid results are generated already. Is it even possible?
I have used joblib before. If I understand correctly this uses the pool methods of multiprocessing as a backend which only works for separate tasks? I am thinking that queues might be helpful but thus far I failed at implementing them.
I found a working solution. I decided to run the while loop in parallel and have the different processes interact over a shared counter. Furthermore, I vectorized the search for suitable samples.
The vectorization yielded a ~300x speedup and running on 4 cores speeds up the computation ~twofold.
First I tried to implement separate processes and put the results into a queue. Turns out these aren't made to store large amounts of data.
If someone sees another bottleneck in that code I would be glad if someone pointed it out.
With my basically nonexistent knowledge about parallel computing I found it really hard to puzzle this together, especially since the example on the internet are all very basic. I learnt a lot though =)
My code:
import numpy as np
import pandas as pd
import itertools
from multiprocessing import Pool, Lock, Value
from datetime import datetime
import settings
val = Value('i', 0)
worker_ID = Value('i', 1)
lock = Lock()
def findSamp(no_trees, df, no_feat, no_samp):
lock.acquire()
print 'starting worker - {0}'.format(worker_ID.value)
worker_ID.value +=1
worker_ID_local = worker_ID.value
lock.release()
max_iter = 100000
samp = []
feat = []
iter_outer = 0
iter = 0
while val.value < no_trees and iter_outer<max_iter:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False
#get samples with random features from dataset;
#find and select samples that don't have missing values in the random features
samp_rand = df.iloc[:,rand_feat]
nan_idx = np.unique(np.where(samp_rand == -1)[0])
all_idx = np.arange(df.shape[0])
notnan_bool = np.invert(np.in1d(all_idx, nan_idx))
notnan_idx = np.where(notnan_bool == True)[0]
if notnan_idx.shape[0] >= no_samp:
#if enough samples for random feature subset, select no_samp samples randomly
notnan_idx_rand = np.random.choice(notnan_idx, no_samp, replace=False)
rand_feat_rand = rand_feat
lock.acquire()
val.value += 1
#x = val.value
lock.release()
#print 'no of trees generated: {0}'.format(x)
samp.append(notnan_idx_rand)
feat.append(rand_feat_rand)
else:
#increase iter_outer counter if no sample subset could be found for random feature subset
iter_outer += 1
iter+=1
if iter >= max_iter:
print 'exiting worker{0} because iter >= max_iter'.format(worker_ID_local)
else:
print 'worker{0} - finished'.format(worker_ID_local)
return samp, feat
def initialize(*args):
global val, worker_ID, lock
val, worker_ID, lock = args
def star_findSamp(i_df_no_feat_no_samp):
return findSamp(*i_df_no_feat_no_samp)
if __name__ == '__main__':
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
df = df.fillna(-1)
df = df.iloc[:, 6:]
no_feat = 700
no_samp = 10
no_trees = 5000
startTime = datetime.now()
print 'starting multiprocessing'
ncores = 4
p = Pool(ncores, initializer=initialize, initargs=(val, worker_ID, lock))
args = itertools.izip([no_trees]*ncores, itertools.repeat(df), itertools.repeat(no_feat), itertools.repeat(no_samp))
result = p.map(star_findSamp, args)#, callback=log_result)
p.close()
p.join()
print '{0} sample subsets for tree training have been found'.format(val.value)
samples = [x[0] for x in result if x != None]
samples = np.vstack(samples)
features = [x[1] for x in result if x != None]
features = np.vstack(features)
print datetime.now() - startTime