Python Multithreaded Matrix Multiplication

Python Multithreaded Matrix Multiplication - python

I am doing a project for the end of the semester and I need to be able to take a matrix to a power and I need to make the problem multithreaded.
This code works in some situations and does not in other situations. I believe that it has to do with the logic in the nested loops in the process_data function but I am not sure what I am doing wrong! I have been working on this for a couple weeks and I am absolutely stumped. It seems like it has something to do with my threads going out of bounds but even then I am not very sure because there are some situations where the threads go out of bounds but then still calculates the matrices properly.
Please help!
import copy
import numpy
import Queue
import random
import threading
import time
import timeit
# Create variable that determines the number of columns and
# rows in the matrix.
n = 4
# Create variable that determines the power we are taking the
# matrix to.
p = 2
# Create variable that determines the number of threads we are
# using.
t = 2
# Create an exit flag.
exitFlag = 0
# Create threading class.
class myThread (threading.Thread):
def __init__(self, threadID, name, q):
threading.Thread.__init__(self)
self.threadID = threadID
self.name = name
self.q = q
def run(self):
print "Starting " + self.name
process_data(self.name, self.q)
print "Exiting " + self.name
# Create a function that will split our data into multiple threads
# and do the matrix multiplication.
def process_data(threadName, q):
numCalc = ((n^3)/t)
for a in range(p-1):
for b in range((numCalc*(q-1)),(numCalc*(q))):
for c in range(n):
for d in range(n):
matrix[a+1][b][c] += matrix[a][b][d] * matrix[0][d][c]
# Create a three dimensional matrix that will store the ouput for
# each power of the matrix multiplication.
matrix = [[[0 for k in xrange(n)] for j in xrange(n)] for i in xrange(p)]
print matrix
# This part fills our initial n by n matrix with random numbers
# ranging from 0 to 9 and then prints it!
print "Populating Matrix!"
for i in range(n):
for j in range(n):
matrix[0][i][j] = random.randint(0,9)
# Tells the user that we are multiplying matrices and starts the
# timer.
print "Taking our matrix to the next level!"
start = timeit.default_timer()
threadLock = threading.Lock()
threads = []
threadID = 1
# Create new threads
for tName in range(t):
thread = myThread(threadID, "Thread-0"+str(tName), threadID)
thread.start()
threads.append(thread)
threadID += 1
# Wait for all threads to complete
for x in threads:
x.join()
stop = timeit.default_timer()
print stop - start
print "Exiting main thread!"
print matrix
Taking the matrix squared seems to work in every case but if I try to calculate beyond that the remaining powers come out with matrices that are filled with zeroes! The case that I have posted works.
When I change the n, p and t variables is when I run into problems where it does not calculate properly.
Thank you for your time.

This is not correct:
numCalc = ((n^3)/t)
for b in range((numCalc*(q-1)),(numCalc*(q))):
For instance, when n = 4 and t = 2, the first thread should have b range over the columns [0,1] and the second thread range over the columns [2,3]. But this calculation gives:
numCalc = 8 / 2 = 4
thread 1 ranges b over range(0, 4) = [0,1,2,3]
thread 2 ranges b over range(4, 8) = [4,5,6,7]
So thread 1 does all of the work and thread 2 tries to access non-existent columns!

Related

Python multiprocessing: how to create x number of processes and get return value back

I have a program that I created using threads, but then I learned that threads don't run concurrently in python and processes do. As a result, I am trying to rewrite the program using multiprocessing, but I am having a hard time doing so. I have tried following several examples that show how to create the processes and pools, but I don't think it's exactly what I want.
Below is my code with the attempts I have tried. The program tries to estimate the value of pi by randomly placing points on a graph that contains a circle. The program takes two command-line arguments: one is the number of threads/processes I want to create, and the other is the total number of points to try placing on the graph (N).
import math
import sys
from time import time
import concurrent.futures
import random
import multiprocessing as mp
def myThread(arg):
# Take care of imput argument
n = int(arg)
print("Thread received. n = ", n)
# main calculation loop
count = 0
for i in range (0, n):
x = random.uniform(0,1)
y = random.uniform(0,1)
d = math.sqrt(x * x + y * y)
if (d < 1):
count = count + 1
print("Thread found ", count, " points inside circle.")
return count;
# end myThread
# receive command line arguments
if (len(sys.argv) == 3):
N = sys.argv[1] # original ex: 0.01
N = int(N)
totalThreads = sys.argv[2]
totalThreads = int(totalThreads)
print("N = ", N)
print("totalThreads = ", totalThreads)
else:
print("Incorrect number of arguments!")
sys.exit(1)
if ((totalThreads == 1) or (totalThreads == 2) or (totalThreads == 4) or (totalThreads == 8)):
print()
else:
print("Invalid number of threads. Please use 1, 2, 4, or 8 threads.")
sys.exit(1)
# start experiment
t = int(time() * 1000) # begin run time
total = 0
# ATTEMPT 1
# processes = []
# for i in range(totalThreads):
# process = mp.Process(target=myThread, args=(N/totalThreads))
# processes.append(process)
# process.start()
# for process in processes:
# process.join()
# ATTEMPT 2
#pool = mp.Pool(mp.cpu_count())
#total = pool.map(myThread, [N/totalThreads])
# ATTEMPT 3
#for i in range(totalThreads):
#total = total + pool.map(myThread, [N/totalThreads])
# p = mp.Process(target=myThread, args=(N/totalThreads))
# p.start()
# ATTEMPT 4
# with concurrent.futures.ThreadPoolExecutor() as executor:
# for i in range(totalThreads):
# future = executor.submit(myThread, N/totalThreads) # start thread
# total = total + future.result() # get result
# analyze results
pi = 4 * total / N
print("pi estimate =", pi)
delta_time = int(time() * 1000) - t # calculate time required
print("Time =", delta_time, " milliseconds")
I thought that creating a loop from 0 to totalThreads that creates a process for each iteration would work. I also wanted to pass in N/totalThreads (to divide the work), but it seems that processes take in an iterable list rather than an argument to pass to the method.
What is it I am missing with multiprocessing? Is it at all possible to even do what I want to do with processes?
Thank you in advance for any help, it is greatly appreciated :)

I have simplified your code and used some hard-coded values which may or may not be reasonable.
import math
import concurrent.futures
import random
from datetime import datetime
def myThread(arg):
count = 0
for i in range(0, arg[0]):
x = random.uniform(0, 1)
y = random.uniform(0, 1)
d = math.sqrt(x * x + y * y)
if (d < 1):
count += 1
return count
N = 10_000
T = 8
_start = datetime.now()
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {executor.submit(myThread, (int(N / T),)): _ for _ in range(T)}
total = 0
for future in concurrent.futures.as_completed(futures):
total += future.result()
_end = datetime.now()
print(f'Estimate for PI = {4 * total / N}')
print(f'Run duration = {_end-_start}')
A typical output on my machine looks like this:-
Estimate for PI = 3.1472
Run duration = 0:00:00.008895
Bear in mind that the number of threads you start is effectively managed by the ThreadPoolExecutor (TPE) [ when constructed with no parameters ]. It makes decisions about the number of threads that can run based on your machine's processing capacity (number of cores etc). Therefore you could, if you really wanted to, set T to a very high number and the TPE will block execution of any new threads until it determines that there is capacity.

shared memory between processes

I'm playing around with the multiprocessing module in python and trying to parallelize an algorithm that loops through an list with a different increment value each time (modification of the Sieve of Eratosthenes algorithm). Therefore, I want to have a shared list between all of the processes so that all the processes are modifying the same list. I've tried with the multiprocessing.Array function, but when I reach the end of the program the array is still unmodified and still contains all 0's (the value that I initialized it to).
import multiprocessing
import math
num_cores = multiprocessing.cpu_count()
lower = 0
mark = None
def mark_array(k):
global mark
index = (-(-lower//k)*k)-lower
for i in range(index, len(mark), k):
mark[i] = 1
def sieve(upper_bound, lower_bound):
size = upper_bound - lower_bound + 1
global mark
mark = multiprocessing.Array('i', size, lock=False)
for i in range(size):
mark[i] = 0
klimit = int(math.sqrt(upper_bound)) + 1
global lower
lower = lower_bound
if __name__ == '__main__':
pool = multiprocessing.Pool(processes=num_cores)
inputs = list(range(2, klimit+1))
pool.map(mark_array, inputs)
pool.close()
pool.join()
result = []
for i in range(size):
result.append(mark[i])
print(result)
sieve(200,100)
Pardon the code. It's a bit messy, but I'm just trying to get the shared memory to work before I clean it up.
EDIT: Ok, so I tried the exact same code on a linux machine and there I get my expected output. However, running the same code in VS code on a Windows machine does not. Any idea why?
EDIT#2: This seems to be a Windows specific issue as the Windows OS handles processes differently than Linux. If this is the case, any idea how to solve it?

You could try to use multiprocessing.Manager for your task:
import multiprocessing
import math
from functools import partial
num_cores = multiprocessing.cpu_count()
lower = 0
def mark_array(mark, k):
index = (-(-lower // k) * k) - lower
for i in range(index, len(mark), k):
mark[i] = 1
def sieve(upper_bound, lower_bound):
size = upper_bound - lower_bound + 1
klimit = int(math.sqrt(upper_bound)) + 1
global lower
lower = lower_bound
if __name__ == '__main__':
pool = multiprocessing.Pool(processes=num_cores)
with multiprocessing.Manager() as manager:
mark = manager.list(range(size))
for i in range(size):
mark[i] = 0
inputs = list(range(2, klimit + 1))
foo = partial(mark_array, mark)
pool.map(foo, inputs)
pool.close()
pool.join()
result = []
for i in range(size):
result.append(mark[i])
print(result)
sieve(200, 100)

Trying to optimize my complex function to excute in a polynomial time

I have this code that generate all the 2**40 possible binary numbers, and from this binary numbers, i will try to get all the vectors that match my objectif function conditions which is:
1- each vector in the matrix must have 20 of ones(1).
2- the sum of s = s + (the index of one +1)* the rank of the one must equal 4970.
i wrote this code but it will take a lot of time maybe months, to give the results. Now, i am looking for an alternative way or an optimization of this code if possible.
import time
from multiprocessing import Process
from multiprocessing import Pool
import numpy as np
import itertools
import numpy
CC = 20
#test if there is 20 numbers of 1
def test1numebers(v,x=1,x_l=CC):
c = 0
for i in range(len(v)):
if(v[i]==x):
c+=1
if c == x_l:
return True
else:
return False
#s = s+ the nth of 1 * (index+1)
def objectif_function(v,x=1):
s = 0
for i in range(len(v)):
if(v[i]==x):
s = s+((i+1)*nthi(v,i))
return s
#calculate the nth of 1 in a vecteur
def nthi(v,i):
c = 0
for j in range(0,i+1):
if(v[j] == 1):
c+=1
return c
#generate 2**40 of all possible binray numbers
def generateMatrix(N):
l = itertools.product([0, 1], repeat=N)
return l
#function that get the number of valide vector that match our objectif function
def main_algo(N=40,S=4970):
#N = 40
m = generateMatrix(N)
#S = 4970
c = 0
ii = 0
for i in m:
ii+=1
print("\n count:",ii)
xx = i
if(test1numebers(xx)):
if(objectif_function(xx)==S):
c+=1
print('found one')
print('\n',xx,'\n')
if ii>=1000000:
break
t_end = time.time()
print('time taken for 10**6 is: ',t_end-t_start)
print(c)
#main_algo()
if __name__ == '__main__':
'''p = Process(target=main_algo, args=(40,4970,))
p.start()
p.join()'''
p = Pool(150)
print(p.map(main_algo, [40,4970]))

While you could make a lot of improvements in readability and make your code more pythonic.
I recommend that you use numpy which is the fastest way of working with matrixes.
Avoid working with matrixes on a "pixel by pixel" loop. With numpy you can make those calculations faster and with all the data at once.
Also numpy has support for generating matrixes really fast. I think that you could make a random [0,1] matrix in less lines of code and quite faster.
Also i recommend that you install OPENBLAS, ATLAS and LAPACK which make linear algebra calculations quite faster.
I hope this helps you.

MPI processor quantity creates error, how to implement broadcast?

I have created a python program to calculate pi. I then decided to write it with mpi4py to run with several processes. The program works, but it returns a different value for pi than the original python version. As I looked into this problem more, I found that it returns a less accurate value when I run it with more processors. Why does the MPI version change the result with more processors? Also would it make more sense to use a broadcast rather then sending lots of individual messages? How would I implement broadcast if it is more effective?
MPI version:
#!/apps/moose/miniconda/bin/python
from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
name = MPI.Get_processor_name()
def f(x):
return (1-(float(x)**2))**float(0.5)
n = 1000000
nm = dict()
pi = dict()
for i in range(1,size+1):
if i == size:
nm[i] = (i*n/size)+1
else:
nm[i] = i*n/size
if rank == 0:
val = 0
for i in range(0,nm[1]):
val = val+f(float(i)/float(n))
val = val*2
pi[0] = (float(2)/n)*(float(1)+val)
print name, "rank", rank, "calculated", pi[0]
for i in range(1, size):
pi[i] = comm.recv(source=i, tag=i)
number = sum(pi.itervalues())
number = "%.20f" %(number)
import time
time.sleep(0.3)
print "Pi is approximately", number
for proc in range(1, size):
if proc == rank:
val = 0
for i in range(nm[proc]+1,nm[proc+1]):
val = val+f(float(i)/float(n))
val = val*2
pi[proc] = (float(2)/n)*(float(1)+val)
comm.send(pi[proc], dest=0, tag = proc)
print name, "rank", rank, "calculated", pi[proc]
Original Python version:
#!/usr/bin/python
n = 1000000
def f(x):
return (1-(float(x)**2))**float(0.5)
val = 0
for i in range(n):
i = i+1
val = val+f(float(i)/float(n))
val = val*2
pi = (float(2)/n)*(float(1)+val)
print pi

Your code estimates by computing the area of the quarter of a disk, that is the intergral of using the trapezoidal rule.
The problem of your code is that the ranges of the i values for each process are not complete. Indeed, use a small n and print i to see what is happening. For instance, for i in range(nm[proc]+1,nm[proc+1]): must be changed to for i in range(nm[proc],nm[proc+1]):. Otherwise, i=nm[proc] is never handled.
In addition, in pi[0] = (float(2)/n)*(float(1)+val) and pi[proc] = (float(2)/n)*(float(1)+val), the term float(1) comes from x=0 in the integral. But it is counted many times, once by each process! As the number of errors varies directly with the number of processes, increasing the number of processes decreases the accuracy, which is the symptom that you have reported.
A broadcast corresponds to a situation where all processes of a communicator must get the same piece of data from a given process. On the contrary, it is here required that data from all processors must be combined using a sum to produce a result available to a single process (called "root"). The latter operation is called a reduction and it is performed by comm.Reduce().
Here is a piece of code based on yours using comm.Reduce() instead of send() and recv().
from mpi4py import MPI
import numpy as np
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
name = MPI.Get_processor_name()
def f(x):
return (1-(float(x)**2))**float(0.5)
n = 10000000
nm =np.zeros(size+1,'i')
nm[0]=1
for i in range(1,size+1):
if i == size:
nm[i]=n
else:
nm[i] = (i*n)/size
val=0
for i in range(nm[rank],nm[rank+1]):
val = val+f((float(i))/float(n))
out=np.array(0.0, 'd')
vala=np.array(val, 'd')
comm.Reduce([vala,MPI.DOUBLE],[out,MPI.DOUBLE],op=MPI.SUM,root=0)
if rank == 0:
number =(float(4)/n)*(out)+float(2)/n
number = "%.20f" %(number)
import time
time.sleep(0.3)
print "Pi is approximately", number

Parallelization/multiprocessing of conditional for loop

I want to use multiprocessing in Python to speed up a while loop.
More specifically:
I have a matrix (samples*features). I want to select x subsets of samples whose values at a random subset of features is unequal to a certain value (-1 in this case).
My serial code:
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
no_feat = 500
no_samp = 5
no_trees = 5
i=0
iter=0
samples = np.zeros((no_trees, no_samp))
features = np.zeros((no_trees, no_feat))
while i < no_trees:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False)
iter_order = np.random.choice(df.shape[0], df.shape[0], replace=False)
samp_idx = []
a=0
#--------------
#how to run in parallel?
for j in iter_order:
pot_samp = df.iloc[j, rand_feat]
if len(np.where(pot_samp==-1)[0]) == 0:
samp_idx.append(j)
if len(samp_idx) == no_samp:
print a
break
a+=1
#--------------
if len(samp_idx) == no_samp:
samples[i,:] = samp_idx
features[i, :] = rand_feat
i+=1
iter+=1
if iter>1000: #break if subsets cannot be found
break
Searching for fitting samples is the potentially expensive part (the j for loop), which in theory can be run in parallel. In some cases, it is not necessary to iterate over all samples to find a large enough subset, which is why I am breaking out of the loop as soon as the subset is large enough.
I am struggling to find an implementation that would allow for checks of how many valid results are generated already. Is it even possible?
I have used joblib before. If I understand correctly this uses the pool methods of multiprocessing as a backend which only works for separate tasks? I am thinking that queues might be helpful but thus far I failed at implementing them.

I found a working solution. I decided to run the while loop in parallel and have the different processes interact over a shared counter. Furthermore, I vectorized the search for suitable samples.
The vectorization yielded a ~300x speedup and running on 4 cores speeds up the computation ~twofold.
First I tried to implement separate processes and put the results into a queue. Turns out these aren't made to store large amounts of data.
If someone sees another bottleneck in that code I would be glad if someone pointed it out.
With my basically nonexistent knowledge about parallel computing I found it really hard to puzzle this together, especially since the example on the internet are all very basic. I learnt a lot though =)
My code:
import numpy as np
import pandas as pd
import itertools
from multiprocessing import Pool, Lock, Value
from datetime import datetime
import settings
val = Value('i', 0)
worker_ID = Value('i', 1)
lock = Lock()
def findSamp(no_trees, df, no_feat, no_samp):
lock.acquire()
print 'starting worker - {0}'.format(worker_ID.value)
worker_ID.value +=1
worker_ID_local = worker_ID.value
lock.release()
max_iter = 100000
samp = []
feat = []
iter_outer = 0
iter = 0
while val.value < no_trees and iter_outer<max_iter:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False
#get samples with random features from dataset;
#find and select samples that don't have missing values in the random features
samp_rand = df.iloc[:,rand_feat]
nan_idx = np.unique(np.where(samp_rand == -1)[0])
all_idx = np.arange(df.shape[0])
notnan_bool = np.invert(np.in1d(all_idx, nan_idx))
notnan_idx = np.where(notnan_bool == True)[0]
if notnan_idx.shape[0] >= no_samp:
#if enough samples for random feature subset, select no_samp samples randomly
notnan_idx_rand = np.random.choice(notnan_idx, no_samp, replace=False)
rand_feat_rand = rand_feat
lock.acquire()
val.value += 1
#x = val.value
lock.release()
#print 'no of trees generated: {0}'.format(x)
samp.append(notnan_idx_rand)
feat.append(rand_feat_rand)
else:
#increase iter_outer counter if no sample subset could be found for random feature subset
iter_outer += 1
iter+=1
if iter >= max_iter:
print 'exiting worker{0} because iter >= max_iter'.format(worker_ID_local)
else:
print 'worker{0} - finished'.format(worker_ID_local)
return samp, feat
def initialize(*args):
global val, worker_ID, lock
val, worker_ID, lock = args
def star_findSamp(i_df_no_feat_no_samp):
return findSamp(*i_df_no_feat_no_samp)
if __name__ == '__main__':
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
df = df.fillna(-1)
df = df.iloc[:, 6:]
no_feat = 700
no_samp = 10
no_trees = 5000
startTime = datetime.now()
print 'starting multiprocessing'
ncores = 4
p = Pool(ncores, initializer=initialize, initargs=(val, worker_ID, lock))
args = itertools.izip([no_trees]*ncores, itertools.repeat(df), itertools.repeat(no_feat), itertools.repeat(no_samp))
result = p.map(star_findSamp, args)#, callback=log_result)
p.close()
p.join()
print '{0} sample subsets for tree training have been found'.format(val.value)
samples = [x[0] for x in result if x != None]
samples = np.vstack(samples)
features = [x[1] for x in result if x != None]
features = np.vstack(features)
print datetime.now() - startTime

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python Multithreaded Matrix Multiplication - python

Related

Python multiprocessing: how to create x number of processes and get return value back

shared memory between processes

Trying to optimize my complex function to excute in a polynomial time

MPI processor quantity creates error, how to implement broadcast?

Parallelization/multiprocessing of conditional for loop

Categories

Resources