python MPI for dictionary iteration

python MPI for dictionary iteration - python

I would like to split the iteration of dictionary using MPI (mpi4.py)(message passing interface).
for example,
from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
tmp_list = []
for key, value in some_dict.items():
tmp_values = some_function(key,value)
tmp_list.append(tmp_values)
there is some simple code.
How do i compose the MPI code for the iteration.

You will first need to convert the dict into a list and then divide the list into a number of processes that you will be using. This is necessary so that comm.scatter can send parts of data across all processes. And then final results could be gathered using comm.gather
script.py
#!/usr/bin/python
from mpi4py import MPI
comm = MPI.COMM_WORLD
size = comm.Get_size() # get number of processes
rank = comm.Get_rank() # get the current rank of process
def some_fun(x,y): # some random function
return x+y
def chunkIt(seq, num): #function to chunk list into {size} parts for scatter to work
avg = len(seq) / float(num)
out = []
last = 0.0
while last < len(seq):
out.append(seq[int(last):int(last + avg)])
last += avg
return out
some_dict = {i: i**2 for i in range(100)} # some random data to work on
if rank == 0:
data = chunkIt(list(some_dict.items()), size) # convert dict into list first and then divide the data into {size} parts
# print(f"rank: {rank} / data: {data}")
else:
data = None
data = comm.scatter(data, root=0) # scatter the data accross given processes
print(f"rank: {rank} / data: {data}\n")
sub_dict = dict(data) # convert the list into dict
tmp_list = [] #local to each process
for key, value in sub_dict.items():
tmp_values = some_fun(key, value)
tmp_list.append(tmp_values)
tmp_list = comm.gather(tmp_list, root=0) # gathering data from all procs to root proc
if rank == 0:
print(f"length of tmp_list on rank: {rank} is: {len(tmp_list)}")
print(f"tmp_list: {tmp_list}") #tmp_list is list ot lists. make sure to convert it into required ds
else:
assert tmp_list is None
make it executable using chmod
chmod +x script.py
and then run
mpiexec -n 4 script.py
-n is the number of processes to run
Note: I am using ubuntu 16.04 and python 3.7.10 and mpi4py==3.0.3

Related

ProcessPoolExecutor on shared dataset and multiple arguments

I am facing an issue I was not able to solve by doing some search on the web.
I am using the minimal code below. The goal is to run some function 'f_sum' several million times by multiprocessing (using the ProcessPoolExecutor). I am adding multiple arguments by a list of tuples 'args'. In addition, the function is supposed to use some sort of data which is the same for all executions (in the example it's just one number). I do not want to add the data to the 'args' tuple for memory reasons.
The only option I found so far is adding the data outside of the "if name == 'main'". This will (for some reason that I do not understand) make the variable available to all processes. However, updating is not possible. Also, I do not really want to make the data definition outside because in the actual code it will be based on data import and might require additional manipulation.
Hope you can help and thanks in advance!
PS: I am using Python 3.7.9 on Win 10.
from concurrent.futures import ProcessPoolExecutor
import numpy as np
data = 0 # supposed to be a large data set & shared among all calculations)
num_workers = 6 # number of CPU cores
num_iterations = 10 # supposed to be large number
def f_sum(args):
(x,y) = args
print('This is process', x, 'with exponent:', y)
value = 0
for i in range(10**y):
value += i
return value/10**y + data
def multiprocessing(func, args, workers):
with ProcessPoolExecutor(workers) as executor:
results = executor.map(func, args)
return list(results)
if __name__ == '__main__':
data = 0.5 # try to update data, should not be part of 'args' due to memory
args = []
for k in range(num_iterations):
args.append((k, np.random.randint(1,8)))
result = multiprocessing(f_sum, args, num_workers)
if np.abs(result[0]-np.round(result[0])) > 0:
print('data NOT updated')
Edit to original question:
>> Performance Example 1
from concurrent.futures import ProcessPoolExecutor
import numpy as np
import time
data_size = 10**8
num_workers = 4
num_sum = 10**7
num_iterations = 100
data = np.random.randint(0,100,size=data_size)
# data = np.linspace(0,data_size,data_size+1, dtype=np.uintc)
def f_sum(args):
(x,y) = args
print('This is process', x, 'random number:', y, 'last data', data[-1])
value = 0
for i in range(num_sum):
value += i
result = value - num_sum*(num_sum-1)/2 + data[-1]
return result
def multiprocessing(func, args, workers):
with ProcessPoolExecutor(workers) as executor:
results = executor.map(func, args)
return list(results)
if __name__ == '__main__':
t0 = time.time()
args = []
for k in range(num_iterations):
args.append((k, np.random.randint(1,10)))
result = multiprocessing(f_sum, args, num_workers)
print(f'expected result: {data[-1]}, actual result: {np.unique(result)}')
t1 = time.time()
print(f'total time: {t1-t0}')
>> Output
This is process 99 random number: 6 last data 9
expected result: 86, actual result: [ 3. 9. 29. 58.]
total time: 11.760863542556763
Leads to false result if randint is used. For linspace result is correct.
>> Performance Example 2 - based on proposal in answer
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from multiprocessing import Array
import time
data_size = 10**8
num_workers = 4
num_sum = 10**7
num_iterations = 100
input = np.random.randint(0, 100, size=data_size)
# input = np.linspace(0, data_size, data_size + 1, dtype=np.uintc)
def f_sum(args):
(x,y) = args
print('This is process', x, 'random number:', y, 'last data', data[-1])
value = 0
for i in range(num_sum):
value += i
result = value - num_sum*(num_sum-1)/2 + data[-1]
return result
def init_pool(the_data):
global data
data = the_data
def multiprocessing(func, args, workers, input):
data = Array('i', input, lock=False)
with ProcessPoolExecutor(max_workers=workers, initializer=init_pool, initargs=(data,)) as executor:
results = list(executor.map(func, args))
return results
if __name__ == '__main__':
t0 = time.time()
args = []
for k in range(num_iterations):
args.append((k, np.random.randint(1,10)))
result = multiprocessing(f_sum, args, num_workers, input)
print(f'expected result: {input[-1]}, actual result:{np.unique(result)}')
t1 = time.time()
print(f'total time: {t1-t0}')
>> Output
This is process 99 random number: 7 last data 29
expected result: 29, actual result: [29.]
total time: 30.8266122341156
#Booboo
I added two examples to my original question, the "Performance Example 2" is based on your code. First interesting finding, my original code actually gives incorrect results if the data array is initialized with random integers. I noticed, that each process by itself initializes the data array. Since it is based on random numbers each process uses a different array for calculation, and even different than the main. So that use case would not work with this code, in your code it is correct all the time.
If using linspace, however, it works, since this gives the same result each time. Same would be true for the use case where some data is read from a file (which is my actual use case). Example 1 is still about 3x faster than Example 2, and I think the time is mainly used by the initializing of the array in your method.
Regarding memory usage I don't see a relevant difference in my task manager. Both Example produce a similar increase in memory, even if the shape is different.
I still believe that your method is the correct approach, however, memory usage seems to be similar and speed is slower in the example above.

The most efficient used of memory would be to use shared memory so that all processes are working on the same instance of data. This would be absolutely necessary if the processes updated data. In the example below, since the access to data is read only and I am using a simple array of integers, I am using multiprocessing.Array with no locking specified. The "trick" is to initialize your pool by specifying the initializer and initargs arguments so that each process in the pool has access to this shared memory. I have made a couple of other changes to the code, which I have commented
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from multiprocessing import Array, cpu_count # new imports
def init_pool(the_data):
global data
data = the_data
def f_sum(args):
(x,y) = args
print('This is process', x, 'with exponent:', y)
value = 0
for i in range(10**y):
value += i
return value/10**y + len(data) # just use the length of data for now
def multiprocessing(func, args, workers):
data = Array('i', range(1000), lock=False) # read-only, integers 0, 1, 2, ... 999
with ProcessPoolExecutor(max_workers=workers, initializer=init_pool, initargs=(data,)) as executor:
results = list(executor.map(func, args)) # create the list of results here
print(results) # so that it can be printed out for demo purposes
return results
if __name__ == '__main__':
num_iterations = 10 # supposed to be large number
#num_workers = 6 # number of CPU cores
num_workers = cpu_count() # number of CPU cores
args = []
for k in range(num_iterations):
args.append((k, np.random.randint(1,8)))
result = multiprocessing(f_sum, args, num_workers)
if np.abs(result[0]-np.round(result[0])) > 0:
print('data NOT updated')
Prints:
This is process 0 with exponent: 2
This is process 1 with exponent: 1
This is process 2 with exponent: 4
This is process 3 with exponent: 3
This is process 4 with exponent: 5
This is process 5 with exponent: 1
This is process 6 with exponent: 5
This is process 7 with exponent: 2
This is process 8 with exponent: 6
This is process 9 with exponent: 6
[1049.5, 1004.5, 5999.5, 1499.5, 50999.5, 1004.5, 50999.5, 1049.5, 500999.5, 500999.5]
data NOT updated
Updated Example 2
You saw my comments to your question concerning Example 1.
Your Example 2 is still not ideal: You have the statement input = np.random.randint(0, 100, size=data_size) as a global being needlessly executed by every process as it is initialized for use in the process pool. Below is an updated solution that also shows one way how you can have your worker function work directly with a numpy array that is backed up a multiprocessing.Array instance so that the numpy array exists in shared memory. You don't have to use this technique for what you are doing since you are only using numpy to create random numbers (I an not sure why), but it is a useful technique to know. But you should re-rerun your code after moving the initialization code of input as I have so it is only executed once.
I don't have the occasion to work with numpy day to day but I have come to learn that it uses multiprocessing internally for many of its own functions. So it is often not the best match for use with multiprocessing, although that does not seem to be applicable here since even in the case below we are just indexing an element of an array and it would not be using a sub-process to accomplish that.
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from multiprocessing import Array
import time
import ctypes
data_size = 10**8
num_workers = 4
num_sum = 10**7
num_iterations = 100
# input = np.linspace(0, data_size, data_size + 1, dtype=np.uintc)
def to_shared_array(arr, ctype):
shared_array = Array(ctype, arr.size, lock=False)
temp = np.frombuffer(shared_array, dtype=arr.dtype)
temp[:] = arr.flatten(order='C')
return shared_array
def to_numpy_array(shared_array, shape):
'''Create a numpy array backed by a shared memory Array.'''
arr = np.ctypeslib.as_array(shared_array)
return arr.reshape(shape)
def f_sum(args):
(x,y) = args
print('This is process', x, 'random number:', y, 'last data', data[-1])
value = 0
for i in range(num_sum):
value += i
result = value - num_sum*(num_sum-1)/2 + data[-1]
return result
def init_pool(shared_array, shape):
global data
data = to_numpy_array(shared_array, shape)
def multiprocessing(func, args, workers, input):
input = np.random.randint(0, 100, size=data_size)
shape = input.shape
shared_array = to_shared_array(input, ctypes.c_long)
with ProcessPoolExecutor(max_workers=workers, initializer=init_pool, initargs=(shared_array, shape)) as executor:
results = list(executor.map(func, args))
return input, results
if __name__ == '__main__':
t0 = time.time()
args = []
for k in range(num_iterations):
args.append((k, np.random.randint(1,10)))
input, result = multiprocessing(f_sum, args, num_workers, input)
print(f'expected result: {input[-1]}, actual result:{np.unique(result)}')
t1 = time.time()
print(f'total time: {t1-t0}')

MPI spawns all processes on master node

Here's my code:
import sys
sys.path.append("/apps/anaconda3/pkgs/qgis-3.8.1-py37h59d211b_0/lib")
import qgis
import os
from qgis.core import *
from mpi4py import MPI
from timeit import default_timer as dt
import numpy as np
# Initialize MPI
comm = MPI.COMM_WORLD
rank = int(comm.Get_rank())
if(rank==0):
print("Project loaded successfully")
# counter function
def counter(feats,vectors,rank):
cnt = 0
for feature in feats:
cands = vectors.getFeatures(QgsFeatureRequest().setFilterRect(feature.geometry().boundingBox()))
for area_feature in cands:
if feature.geometry().intersects(area_feature.geometry()):
cnt+=1
return cnt
start = MPI.Wtime()
# loading layers
layer_ids = list(project.mapLayers().keys())
vec = layer_ids[0]
vectorlayer = project.mapLayers()[vec]
grid = QgsVectorLayer("/home/600.shp", "grid", "ogr")
# define number of jobs
cores = comm.size()
# split data
gfeats = np.array(list(grid.getFeatures()))
subs = np.array_split(gfeats,cores)
# start process based on rank
print("rank : ",rank," len : ",len(sub[rank]))
value = counter(subs[rank],vectorlayer,rank)
ttime = MPI.Wtime()-start
# perform the reductions:
tcount = comm.reduce(value,op=MPI.SUM, root=0)
tstime = comm.reduce(ttime, op=MPI.MAX, root=0)
# Print on rank 0
if rank == 0:
print(' Rank 0: count = ',tcount)
print(' Rank 0: time = ',tstime)
Executed Using:
mpiexec -n 50 python mptest.py
In the code, I'm splitting the array into a number of processes I've mentioned, let's suppose 50, using np.array_split()
After, I'm using rank as an index and executing only that split part in each process(or rank).
HPC contains 5 nodes each with 24 cores.
On running the code, the time is same as when I've used multiprocessing module in python
which means all processes are spawning on a single node.
How can I use all 5 nodes, to execute this code and bring up maximum parallelization?
Thanks in advance!

MPI processor quantity creates error, how to implement broadcast?

I have created a python program to calculate pi. I then decided to write it with mpi4py to run with several processes. The program works, but it returns a different value for pi than the original python version. As I looked into this problem more, I found that it returns a less accurate value when I run it with more processors. Why does the MPI version change the result with more processors? Also would it make more sense to use a broadcast rather then sending lots of individual messages? How would I implement broadcast if it is more effective?
MPI version:
#!/apps/moose/miniconda/bin/python
from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
name = MPI.Get_processor_name()
def f(x):
return (1-(float(x)**2))**float(0.5)
n = 1000000
nm = dict()
pi = dict()
for i in range(1,size+1):
if i == size:
nm[i] = (i*n/size)+1
else:
nm[i] = i*n/size
if rank == 0:
val = 0
for i in range(0,nm[1]):
val = val+f(float(i)/float(n))
val = val*2
pi[0] = (float(2)/n)*(float(1)+val)
print name, "rank", rank, "calculated", pi[0]
for i in range(1, size):
pi[i] = comm.recv(source=i, tag=i)
number = sum(pi.itervalues())
number = "%.20f" %(number)
import time
time.sleep(0.3)
print "Pi is approximately", number
for proc in range(1, size):
if proc == rank:
val = 0
for i in range(nm[proc]+1,nm[proc+1]):
val = val+f(float(i)/float(n))
val = val*2
pi[proc] = (float(2)/n)*(float(1)+val)
comm.send(pi[proc], dest=0, tag = proc)
print name, "rank", rank, "calculated", pi[proc]
Original Python version:
#!/usr/bin/python
n = 1000000
def f(x):
return (1-(float(x)**2))**float(0.5)
val = 0
for i in range(n):
i = i+1
val = val+f(float(i)/float(n))
val = val*2
pi = (float(2)/n)*(float(1)+val)
print pi

Your code estimates by computing the area of the quarter of a disk, that is the intergral of using the trapezoidal rule.
The problem of your code is that the ranges of the i values for each process are not complete. Indeed, use a small n and print i to see what is happening. For instance, for i in range(nm[proc]+1,nm[proc+1]): must be changed to for i in range(nm[proc],nm[proc+1]):. Otherwise, i=nm[proc] is never handled.
In addition, in pi[0] = (float(2)/n)*(float(1)+val) and pi[proc] = (float(2)/n)*(float(1)+val), the term float(1) comes from x=0 in the integral. But it is counted many times, once by each process! As the number of errors varies directly with the number of processes, increasing the number of processes decreases the accuracy, which is the symptom that you have reported.
A broadcast corresponds to a situation where all processes of a communicator must get the same piece of data from a given process. On the contrary, it is here required that data from all processors must be combined using a sum to produce a result available to a single process (called "root"). The latter operation is called a reduction and it is performed by comm.Reduce().
Here is a piece of code based on yours using comm.Reduce() instead of send() and recv().
from mpi4py import MPI
import numpy as np
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
name = MPI.Get_processor_name()
def f(x):
return (1-(float(x)**2))**float(0.5)
n = 10000000
nm =np.zeros(size+1,'i')
nm[0]=1
for i in range(1,size+1):
if i == size:
nm[i]=n
else:
nm[i] = (i*n)/size
val=0
for i in range(nm[rank],nm[rank+1]):
val = val+f((float(i))/float(n))
out=np.array(0.0, 'd')
vala=np.array(val, 'd')
comm.Reduce([vala,MPI.DOUBLE],[out,MPI.DOUBLE],op=MPI.SUM,root=0)
if rank == 0:
number =(float(4)/n)*(out)+float(2)/n
number = "%.20f" %(number)
import time
time.sleep(0.3)
print "Pi is approximately", number

Parallelization/multiprocessing of conditional for loop

I want to use multiprocessing in Python to speed up a while loop.
More specifically:
I have a matrix (samples*features). I want to select x subsets of samples whose values at a random subset of features is unequal to a certain value (-1 in this case).
My serial code:
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
no_feat = 500
no_samp = 5
no_trees = 5
i=0
iter=0
samples = np.zeros((no_trees, no_samp))
features = np.zeros((no_trees, no_feat))
while i < no_trees:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False)
iter_order = np.random.choice(df.shape[0], df.shape[0], replace=False)
samp_idx = []
a=0
#--------------
#how to run in parallel?
for j in iter_order:
pot_samp = df.iloc[j, rand_feat]
if len(np.where(pot_samp==-1)[0]) == 0:
samp_idx.append(j)
if len(samp_idx) == no_samp:
print a
break
a+=1
#--------------
if len(samp_idx) == no_samp:
samples[i,:] = samp_idx
features[i, :] = rand_feat
i+=1
iter+=1
if iter>1000: #break if subsets cannot be found
break
Searching for fitting samples is the potentially expensive part (the j for loop), which in theory can be run in parallel. In some cases, it is not necessary to iterate over all samples to find a large enough subset, which is why I am breaking out of the loop as soon as the subset is large enough.
I am struggling to find an implementation that would allow for checks of how many valid results are generated already. Is it even possible?
I have used joblib before. If I understand correctly this uses the pool methods of multiprocessing as a backend which only works for separate tasks? I am thinking that queues might be helpful but thus far I failed at implementing them.

I found a working solution. I decided to run the while loop in parallel and have the different processes interact over a shared counter. Furthermore, I vectorized the search for suitable samples.
The vectorization yielded a ~300x speedup and running on 4 cores speeds up the computation ~twofold.
First I tried to implement separate processes and put the results into a queue. Turns out these aren't made to store large amounts of data.
If someone sees another bottleneck in that code I would be glad if someone pointed it out.
With my basically nonexistent knowledge about parallel computing I found it really hard to puzzle this together, especially since the example on the internet are all very basic. I learnt a lot though =)
My code:
import numpy as np
import pandas as pd
import itertools
from multiprocessing import Pool, Lock, Value
from datetime import datetime
import settings
val = Value('i', 0)
worker_ID = Value('i', 1)
lock = Lock()
def findSamp(no_trees, df, no_feat, no_samp):
lock.acquire()
print 'starting worker - {0}'.format(worker_ID.value)
worker_ID.value +=1
worker_ID_local = worker_ID.value
lock.release()
max_iter = 100000
samp = []
feat = []
iter_outer = 0
iter = 0
while val.value < no_trees and iter_outer<max_iter:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False
#get samples with random features from dataset;
#find and select samples that don't have missing values in the random features
samp_rand = df.iloc[:,rand_feat]
nan_idx = np.unique(np.where(samp_rand == -1)[0])
all_idx = np.arange(df.shape[0])
notnan_bool = np.invert(np.in1d(all_idx, nan_idx))
notnan_idx = np.where(notnan_bool == True)[0]
if notnan_idx.shape[0] >= no_samp:
#if enough samples for random feature subset, select no_samp samples randomly
notnan_idx_rand = np.random.choice(notnan_idx, no_samp, replace=False)
rand_feat_rand = rand_feat
lock.acquire()
val.value += 1
#x = val.value
lock.release()
#print 'no of trees generated: {0}'.format(x)
samp.append(notnan_idx_rand)
feat.append(rand_feat_rand)
else:
#increase iter_outer counter if no sample subset could be found for random feature subset
iter_outer += 1
iter+=1
if iter >= max_iter:
print 'exiting worker{0} because iter >= max_iter'.format(worker_ID_local)
else:
print 'worker{0} - finished'.format(worker_ID_local)
return samp, feat
def initialize(*args):
global val, worker_ID, lock
val, worker_ID, lock = args
def star_findSamp(i_df_no_feat_no_samp):
return findSamp(*i_df_no_feat_no_samp)
if __name__ == '__main__':
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
df = df.fillna(-1)
df = df.iloc[:, 6:]
no_feat = 700
no_samp = 10
no_trees = 5000
startTime = datetime.now()
print 'starting multiprocessing'
ncores = 4
p = Pool(ncores, initializer=initialize, initargs=(val, worker_ID, lock))
args = itertools.izip([no_trees]*ncores, itertools.repeat(df), itertools.repeat(no_feat), itertools.repeat(no_samp))
result = p.map(star_findSamp, args)#, callback=log_result)
p.close()
p.join()
print '{0} sample subsets for tree training have been found'.format(val.value)
samples = [x[0] for x in result if x != None]
samples = np.vstack(samples)
features = [x[1] for x in result if x != None]
features = np.vstack(features)
print datetime.now() - startTime

Multi-threading in python with loop

I'm trying to solve Problem 8 in project euler with multi-threading technique in python.
Find the greatest product of five consecutive digits in the 1000-digit number. The number can be found here.
My approach is to generate product from chunks of 5 from the original list and repeat this process 5 times, each with the starting index shifted one to the right.
Here is my thread class
class pThread(threading.Thread):
def __init__(self, l):
threading.Thread.__init__(self)
self.l = l
self.p = 0
def run(self):
def greatest_product(l):
"""
Divide the list into chunks of 5 and find the greatest product
"""
def product(seq):
return reduce(lambda x,y : x*y, seq)
def chunk_product(l, n=5):
for i in range(0, len(l), n):
yield product(l[i:i+n])
result = 0
for p in chunk_product(num):
result = result > p and result or p
return result
self.p = greatest_product(self.l)
When I try to create 5 threads to cover all 5-digit chunks in my original list, the manual approach below gives the correct answer, with num being the list of single-digit numbers that I parse from the text:
thread1 = pThread(num)
del num[0]
thread2 = pThread(num)
del num[0]
thread3 = pThread(num)
del num[0]
thread4 = pThread(num)
del num[0]
thread5 = pThread(num)
thread1.start()
thread2.start()
thread3.start()
thread4.start()
thread5.start()
thread1.join()
thread2.join()
thread3.join()
thread4.join()
thread5.join()
def max(*args):
result = 0
for i in args:
result = i > result and i or result
return result
print max(thread1.p, thread2.p, thread3.p, thread4.p, thread5.p)
But this doesn't give the correct result:
threads = []
for i in range(0, 4):
tmp = num[:]
del tmp[0:i+1]
thread = pThread(tmp)
thread.start()
threads.append(thread)
for i in range(0, 4):
threads[i].join()
What did I do wrong here? I'm very new to multithreading so please be gentle.

There are 3 problems:
The first is that the "manual" approach does not give the correct answer. It just happens that the correct answer to the problem is at the offset 4 from the start of your list. You can see this by using:
import operator as op
print max(reduce(op.mul, num[i:i+5]) for i in range(1000))
for k in range(5):
print max(reduce(op.mul, num[i:i+5]) for i in range(k, 1000, 5))
One problem with your "manual" approach is that the threads share the num variable, each has the same list. So when you do del num[0], all threadX.l are affected. The fact that you consistently get the same answer is due to the second problem.
The line
for p in chunk_product(num):
should be:
for p in chunk_product(l):
since you want to use the parameter of function greatest_product(l) and not the global variable num.
In the second method you only spawn 4 threads since the loops range over [0, 1, 2, 3]. Also, you want to delete the values tmp[0:i] and not tmp[0:i+1]. Here is the code:
threads = []
for i in range(5):
tmp = num[:]
del tmp[0:i]
thread = pThread(tmp)
thread.start()
threads.append(thread)
for i in range(5):
threads[i].join()
print len(threads), map(lambda th: th.p, threads)
print max(map(lambda th: th.p, threads))

I took a stab at this mainly to get some practice multiprocessing, and to learn how to use argparse.
This took around 4-5 gigs of ram just in case your machine doesn't have a lot.
python euler.py -l 50000000 -n 100 -p 8
Took 5.836833333969116 minutes
The largest product of 100 consecutive numbers is: a very large number
If you type python euler.py -h at the commandline you get:
usage: euler.py [-h] -l L [L ...] -n N [-p P]
Calculates the product of consecutive numbers and return the largest product.
optional arguments:
-h, --help show this help message and exit
-l L [L ...] A single number or list of numbers, where each # is seperated
by a space
-n N A number that specifies how many consecutive numbers should be
multiplied together.
-p P Number of processes to create. Optional, defaults to the # of
cores on the pc.
And the code:
"""A multiprocess iplementation for calculation the maximum product of N consecutive
numbers in a given range (list of numbers)."""
import multiprocessing
import math
import time
import operator
from functools import reduce
import argparse
def euler8(alist,lenNums):
"""Returns the largest product of N consecutive numbers in a given range"""
return max(reduce(operator.mul, alist[i:i+lenNums]) for i in range(len(alist)))
def split_list_multi(listOfNumbers,numLength,threads):
"""Split a list into N parts where N is the # of processes."""
fullLength = len(listOfNumbers)
single = math.floor(fullLength/threads)
results = {}
counter = 0
while counter < threads:
if counter == (threads-1):
temp = listOfNumbers[single*counter::]
if counter == 0:
results[str(counter)] = listOfNumbers[single*counter::]
else:
prevListIndex = results[str(counter-1)][-int('{}'.format(numLength-1))::]
newlist = prevListIndex + temp
results[str(counter)] = newlist
else:
temp = listOfNumbers[single*counter:single*(counter+1)]
if counter == 0:
newlist = temp
else:
prevListIndex = results[str(counter-1)][-int('{}'.format(numLength-1))::]
newlist = prevListIndex + temp
results[str(counter)] = newlist
counter += 1
return results,threads
def worker(listNumbers,number,output):
"""A worker. Used to run seperate processes and put the results in the queue"""
result = euler8(listNumbers,number)
output.put(result)
def main(listOfNums,lengthNumbers,numCores=multiprocessing.cpu_count()):
"""Runs the module.
listOfNums must be a list of ints, or single int
lengthNumbers is N (an int) where N is the # of consecutive numbers to multiply together
numCores (an int) defaults to however many the cpu has, can specify a number if you choose."""
if isinstance(listOfNums,list):
if len(listOfNums) == 1:
valuesToSplit = [i for i in range(int(listOfNums[0]))]
else:
valuesToSplit = [int(i) for i in listOfNums]
elif isinstance(listOfNums,int):
valuesToSplit = [i for i in range(listOfNums)]
else:
print('First arg must be a number or a list of numbers')
split = split_list_multi(valuesToSplit,lengthNumbers,numCores)
done_queue = multiprocessing.Queue()
jobs = []
startTime = time.time()
for num in range(split[1]):
numChunks = split[0][str(num)]
thread = multiprocessing.Process(target=worker, args=(numChunks,lengthNumbers,done_queue))
jobs.append(thread)
thread.start()
resultlist = []
for i in range(split[1]):
resultlist.append(done_queue.get())
for j in jobs:
j.join()
resultlist = max(resultlist)
endTime = time.time()
totalTime = (endTime-startTime)/60
print("Took {} minutes".format(totalTime))
return print("The largest product of {} consecutive numbers is: {}".format(lengthNumbers, resultlist))
if __name__ == '__main__':
#To call the module from the commandline with arguments
parser = argparse.ArgumentParser(description="""Calculates the product of consecutive numbers \
and return the largest product.""")
parser.add_argument('-l', nargs='+', required=True,
help='A single number or list of numbers, where each # is seperated by a space')
parser.add_argument('-n', required=True, type=int,
help = 'A number that specifies how many consecutive numbers should be \
multiplied together.')
parser.add_argument('-p', default=multiprocessing.cpu_count(), type=int,
help='Number of processes to create. Optional, defaults to the # of cores on the pc.')
args = parser.parse_args()
main(args.l, args.n, args.p)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

python MPI for dictionary iteration - python

Related

ProcessPoolExecutor on shared dataset and multiple arguments

MPI spawns all processes on master node

MPI processor quantity creates error, how to implement broadcast?

Parallelization/multiprocessing of conditional for loop

Multi-threading in python with loop

Categories

Resources