Consider this code snippet
from tqdm import trange
def main_game(depth1, depth2):
# some operator with complexity O(20^max(depth1,depth2))
return depth1+depth2
DEPTH_MAX = 5
total = 0
for depth1 in range(1, DEPTH_MAX + 1):
for depth2 in range(1, DEPTH_MAX + 1):
for i in trange(100):
total += main_game(depth1, depth2)
print(total)
I'm using minimax algorithm in main_game() with branching factor = 10
Now, since the third for-loop has a time-consuming function (up to 100*O(20^5) in time complexity), is there any way I can make it run faster? I'm thinking of parallelizing (multithreading for example). Any suggestion?
Use multiprocessing, and from there Pool().starmap(). starmap() feeds your function with the prepared tuples of arguments in a parallelized manner. And collects the result synchronously.
If the order of the result doesn't matter, you could use the asynchronous version .starmap_async().get().
There are also Pool().apply() and Pool.map() with their _async() versions, but you actually need just to learn Pool().starmap(). It is only some Syntax difference.
import multiprocessing as mp
n_cpu = mp.cpu_count()
# let's say your function is a diadic function (takes two arguments)
def main_game(depth1, depth2):
return depth1 + depth2
DEPTH_MAX = 5
depths = list(range(1, DEPTH_MAX + 1))
# let's pre-prepare the arguments - because that goes fast!
depth1_depth2_pairs = [(d1, d2) for d1 in depths for d2 in depths]
# 1: Init multiprocessing.Pool()
pool = mp.Pool(n_cpu)
# 2: pool.starmap()
results = pool.starmap(main_game, depth_1_depth_2_pairs)
# 3: pool.close()
pool.close()
total = sum(results) # this does your `total +=`
## in this case, you could even use
results = pool.starmap_async(main_game, depth_1_depth_2_pairs).get()
## because the order doesn't matter, if you sum them all up
## which is commutative.
This all you can write slightly more nicer using the with construct (it does the closing automatically, even if an error occurs, so it does not just save you typing but is more secure.
import multiprocessing as mp
n_cpu = mp.cpu_count()
def main_game(depth1, depth2):
return depth1 + depth2
DEPTH_MAX = 5
depths = range(1, DEPTH_MAX + 1)
depth1_depth2_pairs = [(d1, d2) for d1 in depths for d2 in depths]
with mp.Pool(n_cpu) as pool:
results = pool.starmap_async(main_game, depth_1_depth_2_pairs).get()
total = sum(results)
Related
I have a program that I created using threads, but then I learned that threads don't run concurrently in python and processes do. As a result, I am trying to rewrite the program using multiprocessing, but I am having a hard time doing so. I have tried following several examples that show how to create the processes and pools, but I don't think it's exactly what I want.
Below is my code with the attempts I have tried. The program tries to estimate the value of pi by randomly placing points on a graph that contains a circle. The program takes two command-line arguments: one is the number of threads/processes I want to create, and the other is the total number of points to try placing on the graph (N).
import math
import sys
from time import time
import concurrent.futures
import random
import multiprocessing as mp
def myThread(arg):
# Take care of imput argument
n = int(arg)
print("Thread received. n = ", n)
# main calculation loop
count = 0
for i in range (0, n):
x = random.uniform(0,1)
y = random.uniform(0,1)
d = math.sqrt(x * x + y * y)
if (d < 1):
count = count + 1
print("Thread found ", count, " points inside circle.")
return count;
# end myThread
# receive command line arguments
if (len(sys.argv) == 3):
N = sys.argv[1] # original ex: 0.01
N = int(N)
totalThreads = sys.argv[2]
totalThreads = int(totalThreads)
print("N = ", N)
print("totalThreads = ", totalThreads)
else:
print("Incorrect number of arguments!")
sys.exit(1)
if ((totalThreads == 1) or (totalThreads == 2) or (totalThreads == 4) or (totalThreads == 8)):
print()
else:
print("Invalid number of threads. Please use 1, 2, 4, or 8 threads.")
sys.exit(1)
# start experiment
t = int(time() * 1000) # begin run time
total = 0
# ATTEMPT 1
# processes = []
# for i in range(totalThreads):
# process = mp.Process(target=myThread, args=(N/totalThreads))
# processes.append(process)
# process.start()
# for process in processes:
# process.join()
# ATTEMPT 2
#pool = mp.Pool(mp.cpu_count())
#total = pool.map(myThread, [N/totalThreads])
# ATTEMPT 3
#for i in range(totalThreads):
#total = total + pool.map(myThread, [N/totalThreads])
# p = mp.Process(target=myThread, args=(N/totalThreads))
# p.start()
# ATTEMPT 4
# with concurrent.futures.ThreadPoolExecutor() as executor:
# for i in range(totalThreads):
# future = executor.submit(myThread, N/totalThreads) # start thread
# total = total + future.result() # get result
# analyze results
pi = 4 * total / N
print("pi estimate =", pi)
delta_time = int(time() * 1000) - t # calculate time required
print("Time =", delta_time, " milliseconds")
I thought that creating a loop from 0 to totalThreads that creates a process for each iteration would work. I also wanted to pass in N/totalThreads (to divide the work), but it seems that processes take in an iterable list rather than an argument to pass to the method.
What is it I am missing with multiprocessing? Is it at all possible to even do what I want to do with processes?
Thank you in advance for any help, it is greatly appreciated :)
I have simplified your code and used some hard-coded values which may or may not be reasonable.
import math
import concurrent.futures
import random
from datetime import datetime
def myThread(arg):
count = 0
for i in range(0, arg[0]):
x = random.uniform(0, 1)
y = random.uniform(0, 1)
d = math.sqrt(x * x + y * y)
if (d < 1):
count += 1
return count
N = 10_000
T = 8
_start = datetime.now()
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {executor.submit(myThread, (int(N / T),)): _ for _ in range(T)}
total = 0
for future in concurrent.futures.as_completed(futures):
total += future.result()
_end = datetime.now()
print(f'Estimate for PI = {4 * total / N}')
print(f'Run duration = {_end-_start}')
A typical output on my machine looks like this:-
Estimate for PI = 3.1472
Run duration = 0:00:00.008895
Bear in mind that the number of threads you start is effectively managed by the ThreadPoolExecutor (TPE) [ when constructed with no parameters ]. It makes decisions about the number of threads that can run based on your machine's processing capacity (number of cores etc). Therefore you could, if you really wanted to, set T to a very high number and the TPE will block execution of any new threads until it determines that there is capacity.
I read a lot of posts about parallelization using the multiprocessing module but none of them quite answered my question.
I have a very long generator giving me parameter values and for each I want to compute some function value. However, I only want to save the best n many, since I am only interested in the best ones and saving all of the results would blow up the RAM.
There way I see it, there are two ways to do this: 1) use a common shared memory between the processes where the best values are saved or 2) keep separate lists of the best results for each core/process and later manually merge these lists together.
I think the second method would be better, however I am not sure how to implement this.
This is what I got so far:
import numpy as np
import multiprocessing
from functools import partial
def get_generator(length: int):
for i in range(length):
yield [i, i + 1]
def some_func(x, other_stuff):
y = np.sum(x)
return y
def task(other_stuff, x: np.ndarray):
val = some_func(x, other_stuff)
if val > task.some_dict['min']:
task.l.append(val)
task.some_dict['min'] = val
return
def task_init(l, some_dict):
task.l = l
task.some_dict = some_dict
task.some_dict['min'] = np.NINF
n = 20
generator = get_generator(n)
other_stuff = np.nan
func = partial(task, other_stuff)
l = multiprocessing.Manager().list()
some_dict = multiprocessing.Manager().dict()
p = multiprocessing.Pool(None, task_init, [l, some_dict])
p.imap(func, generator, chunksize=10000)
p.close()
p.join()
This would be somewhat similar to what I want to do. But I really care about performance and in the actual code the comparison/saving of the best values will be more complex so I think that the shared memory approach would be really slow.
My question boils down to:
If I have e.g. 8 cores, how could I have 8 lists of the best results each for one core that will be returned, so that the cores work completely independent and rather quick?
Thank you very much!
These are my comments put into action. I hope your actual task is a more complicated computation or it would be hardly worth using multiprocessing.
import numpy as np
import multiprocessing
from functools import partial
from heapq import *
def get_generator(length: int):
for i in range(length):
yield [i, i + 1]
def some_func(x, other_stuff):
y = np.sum(x)
return y
def task(other_stuff, x: np.ndarray):
val = some_func(x, other_stuff)
return val
def main():
n = 20
generator = get_generator(n)
other_stuff = np.nan
func = partial(task, other_stuff)
cpu_count = multiprocessing.cpu_count() - 1 # leave a processor for the main process
chunk_size = n // cpu_count
HEAPSIZE = 8
with multiprocessing.Pool(cpu_count) as pool:
heap = []
for val in pool.imap_unordered(func, generator, chunksize=chunk_size):
if len(heap) < HEAPSIZE:
heappush(heap, val)
elif val > heap[0]:
heappushpop(heap, val)
# sort
values = sorted(heap, reverse=True)
print(values)
if __name__ == '__main__':
main()
Prints:
[39, 37, 35, 33, 31, 29, 27, 25]
Update
I found it best with the following experiment to allocate to the pool a number of processes equal to mp.cpu_count() - 1 to leave the main process a free proceesor to handle the results returned by the workers. I also experimented with the chunksize parameter:
import multiprocessing as mp
import timeit
def worker_process(i):
s = 0
for n in range(10000):
s += i * i # square the argument
s /= 10000
return s
def main():
cpu_count = mp.cpu_count() - 1 # leave a processor for the main process
N = 10000
chunk_size = N // cpu_count # 100 may be good enough
results = []
with mp.Pool(cpu_count) as pool:
for result in pool.imap_unordered(worker_process, range(N), chunksize=chunk_size):
results.append(result)
#print(results[0:10])
if __name__ == '__main__':
print(timeit.timeit(stmt='main()', number=10, globals=globals()) / 10)
On my desktop (running other processes, such as streaming music), the above code did better with assigning mp.cpu_count() - 1 to cpu_count (2.4 seconds vs, 2.5 seconds). Here are other timings (rounded to one decimal place):
chunksize = 1428 -> 2.4 seconds (N // (mp.cpu_count() - 1)
chunksize = 1000 -> 2.7 seconds
chunksize = 100 -> 2.4 seconds
chunksize = 10 -> 2.4 seconds
chunksize = 1 -> 2.6 seconds
The result for a chunksize value of 1000 is a bit of an anomaly. I would suggest trying different values, otherwise N // (mp.cpu_count() - 1). This is assuming you can compute N, the number of items in the iterable. When you have a generator as the iterable, you would have to, in the general case, convert it first to a list, to be able to get its length. Even a chunksize value of 1 in this particular benchmark did not do that much worse. But this is what I have learned from varying the amount of work worker_process has to do:
The more work (i.e. CPU) your worker process has to do to complete its task, the less sensitive it is to the chunksize parameter. If it returns after using very little CPU, then the overhead of transferring the next chunk becomes significant and you want to keep the number of chunk transfers to a small value (i.e. you want a large chunksize value). But if the process is long running, the overhead of transferring the next chunk will not be as impactful.
In the following code the worker process's CPU requirements are trivial:
import multiprocessing as mp
import timeit
def worker_process(i):
return i ** 2
def main():
cpu_count = mp.cpu_count() - 1
N = 100000
chunk_size = N // cpu_count
results = []
with mp.Pool(cpu_count) as pool:
for result in pool.imap_unordered(worker_process, range(N), chunksize=chunk_size):
results.append(result)
print(results[0:10])
if __name__ == '__main__':
print(timeit.timeit(stmt='main()', number=10, globals=globals()) / 10)
The timings:
chunksize = 1428 -> .19 seconds
chunksize = 100 -> .39 seconds
chunksize = 1 -> 11.06 seconds
In the following code the worker process's CPU requirements are more substantial:
import multiprocessing as mp
import timeit
def worker_process(i):
s = 0
for _ in range(1000000):
s += i * i
return s // 1000000
def main():
cpu_count = mp.cpu_count() - 1
N = 1000
chunk_size = N // cpu_count
results = []
with mp.Pool(cpu_count) as pool:
for result in pool.imap_unordered(worker_process, range(N), chunksize=chunk_size):
results.append(result)
print(results[0:10])
if __name__ == '__main__':
print(timeit.timeit(stmt='main()', number=3, globals=globals()) / 3)
The timings:
chunksize = 142 -> 22.6 seconds (N // (mp.cpu_count() - 1)
chunksize = 10 -> 23.5 seconds
chunksize = 1 -> 23.2 seconds
Update 2
According to Python multiprocessing: understanding logic behind chunksize, when methods map, starmap or map_async are called with chunksize=None there is a specific algorithm used to compute a chunksize, which I have used in the code below. I don't know why the default value for methods imap and imap_unordered is 1 and does not use this same algorithm. Perhaps because that wouldn't be "lazy" as implied by the description of these methods. In the following code, which repeats the previous benchmark, I use a redefinition of the same algorithm for computing the default chunksize:
import multiprocessing as mp
import timeit
def worker_process(i):
s = 0
for _ in range(1000000):
s += i * i
return s // 1000000
def compute_chunksize(pool_size, iterable_size):
if iterable_size == 0:
return 0
chunksize, extra = divmod(iterable_size, pool_size * 4)
if extra:
chunksize += 1
return chunksize
def main():
cpu_count = mp.cpu_count() - 1
N = 1000
chunk_size = compute_chunksize(cpu_count, N)
print('chunk_size =', chunk_size)
results = []
with mp.Pool(cpu_count) as pool:
for result in pool.imap_unordered(worker_process, range(N), chunksize=chunk_size):
results.append(result)
print(results[0:10])
if __name__ == '__main__':
print(timeit.timeit(stmt='main()', number=3, globals=globals()) / 3)
Timings:
chunksize 36 -> 22.2 seconds
I'm playing around with the multiprocessing module in python and trying to parallelize an algorithm that loops through an list with a different increment value each time (modification of the Sieve of Eratosthenes algorithm). Therefore, I want to have a shared list between all of the processes so that all the processes are modifying the same list. I've tried with the multiprocessing.Array function, but when I reach the end of the program the array is still unmodified and still contains all 0's (the value that I initialized it to).
import multiprocessing
import math
num_cores = multiprocessing.cpu_count()
lower = 0
mark = None
def mark_array(k):
global mark
index = (-(-lower//k)*k)-lower
for i in range(index, len(mark), k):
mark[i] = 1
def sieve(upper_bound, lower_bound):
size = upper_bound - lower_bound + 1
global mark
mark = multiprocessing.Array('i', size, lock=False)
for i in range(size):
mark[i] = 0
klimit = int(math.sqrt(upper_bound)) + 1
global lower
lower = lower_bound
if __name__ == '__main__':
pool = multiprocessing.Pool(processes=num_cores)
inputs = list(range(2, klimit+1))
pool.map(mark_array, inputs)
pool.close()
pool.join()
result = []
for i in range(size):
result.append(mark[i])
print(result)
sieve(200,100)
Pardon the code. It's a bit messy, but I'm just trying to get the shared memory to work before I clean it up.
EDIT: Ok, so I tried the exact same code on a linux machine and there I get my expected output. However, running the same code in VS code on a Windows machine does not. Any idea why?
EDIT#2: This seems to be a Windows specific issue as the Windows OS handles processes differently than Linux. If this is the case, any idea how to solve it?
You could try to use multiprocessing.Manager for your task:
import multiprocessing
import math
from functools import partial
num_cores = multiprocessing.cpu_count()
lower = 0
def mark_array(mark, k):
index = (-(-lower // k) * k) - lower
for i in range(index, len(mark), k):
mark[i] = 1
def sieve(upper_bound, lower_bound):
size = upper_bound - lower_bound + 1
klimit = int(math.sqrt(upper_bound)) + 1
global lower
lower = lower_bound
if __name__ == '__main__':
pool = multiprocessing.Pool(processes=num_cores)
with multiprocessing.Manager() as manager:
mark = manager.list(range(size))
for i in range(size):
mark[i] = 0
inputs = list(range(2, klimit + 1))
foo = partial(mark_array, mark)
pool.map(foo, inputs)
pool.close()
pool.join()
result = []
for i in range(size):
result.append(mark[i])
print(result)
sieve(200, 100)
Basically have a script that combs a dataset of nodes/points to remove those that overlap. The actual script is more complicated but I pared it down to basically a simple overlap check that does nothing with it for demonstration.
I tried a few variants with locks, queues, pools adding one job at a time versus adding in bulk. Some of the worst offenders were slower by a couple order of magnitude. Eventually I got it to the fastest I could.
The overlap checking algorithm which is send to the individual processes:
def check_overlap(args):
tolerance = args['tolerance']
this_coords = args['this_coords']
that_coords = args['that_coords']
overlaps = False
distance_x = this_coords[0] - that_coords[0]
if distance_x <= tolerance:
distance_x = pow(distance_x, 2)
distance_y = this_coords[1] - that_coords[1]
if distance_y <= tolerance:
distance = pow(distance_x + pow(distance_y, 2), 0.5)
if distance <= tolerance:
overlaps = True
return overlaps
The processing function:
def process_coords(coords, num_processors=1, tolerance=1):
import multiprocessing as mp
import time
if num_processors > 1:
pool = mp.Pool(num_processors)
start = time.time()
print "Start script w/ multiprocessing"
else:
num_processors = 0
start = time.time()
print "Start script w/ standard processing"
total_overlap_count = 0
# outer loop through nodes
start_index = 0
last_index = len(coords) - 1
while start_index <= last_index:
# nature of the original problem means we can process all pairs of a single node at once, but not multiple, so batch jobs by outer loop
batch_jobs = []
# inner loop against all pairs for this node
start_index += 1
count_overlapping = 0
for i in range(start_index, last_index+1, 1):
if num_processors:
# add job
batch_jobs.append({
'tolerance': tolerance,
'this_coords': coords[start_index],
'that_coords': coords[i]
})
else:
# synchronous processing
this_coords = coords[start_index]
that_coords = coords[i]
distance_x = this_coords[0] - that_coords[0]
if distance_x <= tolerance:
distance_x = pow(distance_x, 2)
distance_y = this_coords[1] - that_coords[1]
if distance_y <= tolerance:
distance = pow(distance_x + pow(distance_y, 2), 0.5)
if distance <= tolerance:
count_overlapping += 1
if num_processors:
res = pool.map_async(check_overlap, batch_jobs)
results = res.get()
for r in results:
if r:
count_overlapping += 1
# stuff normally happens here to process nodes connected to this node
total_overlap_count += count_overlapping
print total_overlap_count
print " time: {0}".format(time.time() - start)
And testing function:
from random import random
coords = []
num_coords = 1000
spread = 100.0
half_spread = 0.5*spread
for i in range(num_coords):
coords.append([
random()*spread-half_spread,
random()*spread-half_spread
])
process_coords(coords, 1)
process_coords(coords, 4)
Still, the non-multiprocessing runs in less than 0.4s consistently and the multiprocessing I can get just under 3.0s as it stands above. I get that maybe the algorithm here is too simple to really reap benefits, but considering the above case has nearly half a million iterations and the real case has significantly more, it's weird to me that the multiprocessing is an order of magnitude slower.
What am I missing / what can I do to improve?
Building O(N**2) 3-element dicts not used in the serialized code, and transmitting them over interprocess pipes, is a pretty good way to guarantee multiprocessing can't help ;-) Nothing comes for free - everything costs.
Below is a rewrite that executes much the same code regardless of whether it's run in serial or multiprocessing modes. No new dicts, etc. In general, the larger len(coords), the more benefit it gets from multiprocessing. On my box, at 20000 the multiprocessing run takes about a third of the wall-clock time.
Key to this is that all processes have their own copy of coords. This is done below by transmitting it just once, when the pool is created. That should work on all platforms. On Linux-y systems, it could happen "by magic" instead via forked process inheritance. Reducing the amount of data sent across processes from O(N**2) to O(N) is a huge improvement.
Getting more out of multiprocessing would require better load balancing. As is, a call to check_overlap(i) compares coords[i] to each value in coords[i+1:]. The larger i, the less work there is for it to do, and for the largest values of i just the cost of transmitting i between processes - and transmitting the result back - swamps the amount of time spent in check_overlap(i).
def init(*args):
global _coords, _tolerance
_coords, _tolerance = args
def check_overlap(start_index):
coords, tolerance = _coords, _tolerance
tsq = tolerance ** 2
overlaps = 0
start0, start1 = coords[start_index]
for i in range(start_index + 1, len(coords)):
that0, that1 = coords[i]
dx = abs(that0 - start0)
if dx <= tolerance:
dy = abs(that1 - start1)
if dy <= tolerance:
if dx**2 + dy**2 <= tsq:
overlaps += 1
return overlaps
def process_coords(coords, num_processors=1, tolerance=1):
global _coords, _tolerance
import multiprocessing as mp
_coords, _tolerance = coords, tolerance
import time
if num_processors > 1:
pool = mp.Pool(num_processors, initializer=init, initargs=(coords, tolerance))
start = time.time()
print("Start script w/ multiprocessing")
else:
num_processors = 0
start = time.time()
print("Start script w/ standard processing")
N = len(coords)
if num_processors:
total_overlap_count = sum(pool.imap_unordered(check_overlap, range(N)))
else:
total_overlap_count = sum(check_overlap(i) for i in range(N))
print(total_overlap_count)
print(" time: {0}".format(time.time() - start))
if __name__ == "__main__":
from random import random
coords = []
num_coords = 20000
spread = 100.0
half_spread = 0.5*spread
for i in range(num_coords):
coords.append([
random()*spread-half_spread,
random()*spread-half_spread
])
process_coords(coords, 1)
process_coords(coords, 4)
I want to use multiprocessing in Python to speed up a while loop.
More specifically:
I have a matrix (samples*features). I want to select x subsets of samples whose values at a random subset of features is unequal to a certain value (-1 in this case).
My serial code:
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
no_feat = 500
no_samp = 5
no_trees = 5
i=0
iter=0
samples = np.zeros((no_trees, no_samp))
features = np.zeros((no_trees, no_feat))
while i < no_trees:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False)
iter_order = np.random.choice(df.shape[0], df.shape[0], replace=False)
samp_idx = []
a=0
#--------------
#how to run in parallel?
for j in iter_order:
pot_samp = df.iloc[j, rand_feat]
if len(np.where(pot_samp==-1)[0]) == 0:
samp_idx.append(j)
if len(samp_idx) == no_samp:
print a
break
a+=1
#--------------
if len(samp_idx) == no_samp:
samples[i,:] = samp_idx
features[i, :] = rand_feat
i+=1
iter+=1
if iter>1000: #break if subsets cannot be found
break
Searching for fitting samples is the potentially expensive part (the j for loop), which in theory can be run in parallel. In some cases, it is not necessary to iterate over all samples to find a large enough subset, which is why I am breaking out of the loop as soon as the subset is large enough.
I am struggling to find an implementation that would allow for checks of how many valid results are generated already. Is it even possible?
I have used joblib before. If I understand correctly this uses the pool methods of multiprocessing as a backend which only works for separate tasks? I am thinking that queues might be helpful but thus far I failed at implementing them.
I found a working solution. I decided to run the while loop in parallel and have the different processes interact over a shared counter. Furthermore, I vectorized the search for suitable samples.
The vectorization yielded a ~300x speedup and running on 4 cores speeds up the computation ~twofold.
First I tried to implement separate processes and put the results into a queue. Turns out these aren't made to store large amounts of data.
If someone sees another bottleneck in that code I would be glad if someone pointed it out.
With my basically nonexistent knowledge about parallel computing I found it really hard to puzzle this together, especially since the example on the internet are all very basic. I learnt a lot though =)
My code:
import numpy as np
import pandas as pd
import itertools
from multiprocessing import Pool, Lock, Value
from datetime import datetime
import settings
val = Value('i', 0)
worker_ID = Value('i', 1)
lock = Lock()
def findSamp(no_trees, df, no_feat, no_samp):
lock.acquire()
print 'starting worker - {0}'.format(worker_ID.value)
worker_ID.value +=1
worker_ID_local = worker_ID.value
lock.release()
max_iter = 100000
samp = []
feat = []
iter_outer = 0
iter = 0
while val.value < no_trees and iter_outer<max_iter:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False
#get samples with random features from dataset;
#find and select samples that don't have missing values in the random features
samp_rand = df.iloc[:,rand_feat]
nan_idx = np.unique(np.where(samp_rand == -1)[0])
all_idx = np.arange(df.shape[0])
notnan_bool = np.invert(np.in1d(all_idx, nan_idx))
notnan_idx = np.where(notnan_bool == True)[0]
if notnan_idx.shape[0] >= no_samp:
#if enough samples for random feature subset, select no_samp samples randomly
notnan_idx_rand = np.random.choice(notnan_idx, no_samp, replace=False)
rand_feat_rand = rand_feat
lock.acquire()
val.value += 1
#x = val.value
lock.release()
#print 'no of trees generated: {0}'.format(x)
samp.append(notnan_idx_rand)
feat.append(rand_feat_rand)
else:
#increase iter_outer counter if no sample subset could be found for random feature subset
iter_outer += 1
iter+=1
if iter >= max_iter:
print 'exiting worker{0} because iter >= max_iter'.format(worker_ID_local)
else:
print 'worker{0} - finished'.format(worker_ID_local)
return samp, feat
def initialize(*args):
global val, worker_ID, lock
val, worker_ID, lock = args
def star_findSamp(i_df_no_feat_no_samp):
return findSamp(*i_df_no_feat_no_samp)
if __name__ == '__main__':
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
df = df.fillna(-1)
df = df.iloc[:, 6:]
no_feat = 700
no_samp = 10
no_trees = 5000
startTime = datetime.now()
print 'starting multiprocessing'
ncores = 4
p = Pool(ncores, initializer=initialize, initargs=(val, worker_ID, lock))
args = itertools.izip([no_trees]*ncores, itertools.repeat(df), itertools.repeat(no_feat), itertools.repeat(no_samp))
result = p.map(star_findSamp, args)#, callback=log_result)
p.close()
p.join()
print '{0} sample subsets for tree training have been found'.format(val.value)
samples = [x[0] for x in result if x != None]
samples = np.vstack(samples)
features = [x[1] for x in result if x != None]
features = np.vstack(features)
print datetime.now() - startTime