Python GIL with Threadpool

I have some sample code to demonstrate the Python GIL, along with the relevant output.
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor
import time
from itertools import repeat
from time import sleep

values = [3, 4, 5, 6]

def cube(x, y):
    print(f'Cube of {x}:{x*x*x}')
    c = []
    for i in range(0, 100000):
        c.append(i)
        d = len(c)
        e = c.index(i)
    return x*x*x

y = {3: 3}

if __name__ == '__main__':
    rres = []
    ss = time.time()
    with ThreadPoolExecutor(max_workers=5) as executor:
        rres = executor.map(cube, values, repeat(y))
        print(rres)
        for ij in rres:
            print(ij)
    print(f"Time taken is {time.time() - ss}")
And the output is:
Cube of 3:27
Cube of 4:64Cube of 5:125Cube of 6:216
<generator object Executor.map.<locals>.result_iterator at 0x1103b0f20>
27
64
125
216
Time taken is 234.87321090698242
The timing part of the output is expected: due to the Python GIL, only one thread uses the CPU at a time. But what I fail to understand is these lines:
Cube of 3:27
Cube of 4:64Cube of 5:125Cube of 6:216
These appeared simultaneously. I expected them to come at intervals of roughly 60 seconds, like the sequential counterpart of the code. Can anyone explain this part to me? TIA.
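One way to see what is going on is to timestamp the prints. The sketch below is only an illustration (not part of the original snippet): each worker thread reaches its print statement almost immediately after the task is submitted, before the expensive loop starts, so the prints cluster together; the GIL then serializes the CPU-bound loops, which is what stretches the total runtime.

import time
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat

def cube(x, y):
    # The print runs as soon as the thread picks up the task,
    # before the CPU-bound loop below begins.
    print(f'{time.time():.3f} Cube of {x}:{x*x*x}')
    c = []
    for i in range(100000):
        c.append(i)
        c.index(i)  # O(n) scan each iteration keeps the GIL busy
    return x*x*x

if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=5) as executor:
        for res in executor.map(cube, [3, 4, 5, 6], repeat({3: 3})):
            print(res)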

Related

Increase speed by eliminating loops

I have the following problem. The code below successfully linear-fits my data from sample 50 to sample 400 (I never have more than 400 samples, and the first 50 are of horrendous quality). The third dimension will have the value 7, and the fourth dimension can have values of up to 10000, so this loop "solution" would take a lot of time. How can I avoid the for loop and decrease my runtimes? Thank you for your help (I am pretty new to Python).
from sklearn.linear_model import TheilSenRegressor
import numpy as np

#ransac = linear_model.RANSACRegressor()
skip_v = 50  # number of values to be skipped
N = 400
test_n = np.reshape(range(skip_v, N), (-1, 1))
f_n = 7
d4 = np.shape(data)
a6 = np.ones((f_n, d4[3]))
b6 = np.ones((f_n, d4[3]))
for j in np.arange(d4[3]):
    for i in np.arange(f_n):
        theil = TheilSenRegressor(random_state=0).fit(test_n, np.log(data[skip_v:, 3, i, j]))
        a6[i, j] = theil.coef_
        b6[i, j] = theil.intercept_
You can use multiprocessing to run your loop in parallel. The following code is not a working end-to-end example (data is not defined here); it just demonstrates how to do it. It is only useful if your problem sizes are really big; otherwise, running sequentially is faster.
from sklearn.linear_model import TheilSenRegressor
import numpy as np
import multiprocessing as mp
from itertools import product

def worker_function(input_queue, output_queue, skip_v, test_n, data):
    # consume (i, j) tasks until a 'STOP' sentinel arrives
    for task in iter(input_queue.get, 'STOP'):
        i = task[0]
        j = task[1]
        theil = TheilSenRegressor(random_state=0).fit(test_n, np.log(data[skip_v:, 3, i, j]))
        output_queue.put([i, j, theil])

if __name__ == "__main__":
    # define data here
    f_n = 7
    d4 = np.shape(data)
    skip_v = 50
    N = 400
    test_n = np.reshape(range(skip_v, N), (-1, 1))
    a6 = np.ones((f_n, d4[3]))
    b6 = np.ones((f_n, d4[3]))

    input_queue = mp.Queue()
    output_queue = mp.Queue()

    # here you create all combinations of i and j of your loop
    list1 = range(f_n)
    list2 = range(d4[3])
    list3 = [list1, list2]
    tasks = [p for p in product(*list3)]

    numProc = 4

    # start processes
    process = [mp.Process(target=worker_function,
                          args=(input_queue, output_queue,
                                skip_v, test_n, data)) for x in range(numProc)]
    for p in process:
        p.start()

    # queue tasks
    for i in tasks:
        input_queue.put(i)

    # signal workers to stop after tasks are all done
    for i in range(numProc):
        input_queue.put('STOP')

    # get the results
    for i in range(len(tasks)):
        res = output_queue.get(block=True)  # wait for results
        a6[res[0], res[1]] = res[2].coef_
        b6[res[0], res[1]] = res[2].intercept_
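If the per-fit work dominates, a plain multiprocessing.Pool expresses the same idea with less plumbing. The sketch below is an untested alternative, not from the original answer; the synthetic data array is a stand-in for your real 4-D data, and the shared arrays are defined at module level so spawned worker processes can see them.

from multiprocessing import Pool
from itertools import product
import numpy as np
from sklearn.linear_model import TheilSenRegressor

# module-level definitions so worker processes can access them
skip_v, N, f_n = 50, 400, 7
test_n = np.reshape(range(skip_v, N), (-1, 1))
rng = np.random.RandomState(0)
data = np.exp(rng.rand(N, 4, f_n, 20))  # synthetic stand-in for the real data
d4 = data.shape

def fit_one(task):
    # each worker fits a single (i, j) slice and returns its coefficients
    i, j = task
    theil = TheilSenRegressor(random_state=0).fit(test_n, np.log(data[skip_v:, 3, i, j]))
    return i, j, float(theil.coef_), float(theil.intercept_)

if __name__ == "__main__":
    a6 = np.ones((f_n, d4[3]))
    b6 = np.ones((f_n, d4[3]))
    tasks = list(product(range(f_n), range(d4[3])))
    with Pool(processes=4) as pool:
        for i, j, coef, intercept in pool.map(fit_one, tasks):
            a6[i, j] = coef
            b6[i, j] = intercept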

Python: when is multiprocessing/threading/asyncio even possible? Can my function be calculated in parallel?

I implemented some logic (on Windows 10) that creates a vector of simulated spot prices based on numpy random variables. At the end of the day I need 10, 100 or 1000 of these vectors. Generating 1000 of them with a simple for-loop takes roughly 160 seconds. I tried all the ways of parallelizing it that I could find, e.g. here on Stack Overflow and beyond. Some methods did not even work, others had no effect at all. So either:
the implementation of these parallelization tools was wrong ...
... or my function can't be parallelized (because it already uses all the threads of the CPU???)
Here is my function (mrm, mp and spm are my custom modules):
def tt(i):
    random_variables = np.random.standard_normal((3, I))
    mean_reversion_model = mrm.Model(t, m, random_variables, vola, df_regression_param,
                                     df_mean_reversion_param)
    year_price_simulation = mp.Simulation(mean_reversion_model, df_initial_price)
    year_prices = year_price_simulation.prices()
    monthly_prices = mp.MonthlyPrices(year_prices, monthly_factors_file="month_factors_mr.csv",
                                      date_today=date_today, years_to_future=years_to_future, debug=False)
    df_S_monthly = monthly_prices.get_monthly_prices()
    spot_price_simulation = spm.SpotPrice(jumps, mr_regression, 1., 365, 0.0, df_S_monthly,
                                          verbose=False)
    res = spot_price_simulation.get_simulated_spot_prices()
    # result_dict[i] = res  # only needed in the multiprocessing examples
    return res
And these were my attempts (each placed after if __name__ == '__main__':):

multiprocessing
import multiprocessing
N = 10
MAX_WORKERS = 4
t0 = time.time()
pool = multiprocessing.Pool(processes=MAX_WORKERS)
t = pool.map(tt, range(N)) # dictionary in function is used to collect results
pool.close()
pool.join()
t1 = time.time() - t0
print("Elapsed time: {}s".format(t1))
-> won't come back...
multiprocessing.pool
import multiprocessing.pool
N = 100
MAX_WORKERS = 4
t0 = time.time()
with multiprocessing.pool.ThreadPool(processes=MAX_WORKERS) as pool:
    t = pool.map(tt, range(N))  # dictionary in function is used to collect results
t1 = time.time() - t0
print("Elapsed time: {}s".format(t1))
-> no improvement, same calculation time as for-loop
concurrent.futures
import concurrent.futures
N = 100
result_dict = dict().fromkeys(range(N))
MAX_WORKERS = 4
t0 = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    for idx, out in enumerate(executor.map(tt, range(N))):
        result_dict[idx] = out
t1 = time.time() - t0
print("Elapsed time: {}s".format(t1))
-> no improvement, same calculation time as for-loop
asyncio
import asyncio
def background(f):
    def wrapped(*args, **kwargs):
        return asyncio.get_event_loop().run_in_executor(None, f, *args, **kwargs)
    return wrapped
N = 10
result_dict = dict().fromkeys(range(N))
MAX_WORKERS = 4
t0 = time.time()
for i in range(N):
    result_dict[i] = tt()
t1 = time.time() - t0
print("Elapsed time: {}s".format(t1))
-> Error: asyncio await wasn't used with future
numpy apply method
import numpy as np
N = 100
test = np.zeros((N, 1))
t0 = time.time()
res = np.apply_along_axis(tt, 1, test)
t1 = time.time() - t0
print("Elapsed time: {}s".format(t1))
-> no improvement
Multithreading doesn't really work for Python code because of the global interpreter lock: only one thread at a time can manipulate Python objects. You can use multithreading for calls to non-Python functions.
So you have to use multiprocessing.Pool instead. But on Windows, you must make your main code block conditional, like this:
if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=MAX_WORKERS)
    ...etc...
otherwise, each worker will also attempt to start up a Pool and your system will hang. On Linux this is not necessary, because worker processes are created differently there.
Edit: it seems that you did that.
Another thing to be aware of is that numpy in Anaconda uses multi-threaded Intel MKL routines for many numpy and scipy functions - especially the ones operating on large arrays. In that case, attempting to do multithreading or multiprocessing will be counterproductive.
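If you suspect this is the case, you can inspect or limit the number of threads the underlying BLAS/MKL libraries use. A minimal sketch, assuming the threadpoolctl package is installed (it is not part of the original answer):

import os
# Limiting BLAS/MKL threads via environment variables must happen before numpy is imported.
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

import numpy as np
from threadpoolctl import threadpool_info, threadpool_limits

# Inspect which native thread pools numpy is currently using.
print(threadpool_info())

# Or limit them only around a specific computation.
with threadpool_limits(limits=1):
    a = np.random.standard_normal((2000, 2000))
    b = a @ a

With the native libraries pinned to one thread per worker, process-level parallelism is less likely to oversubscribe the CPU.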

Parallelizing through Multi-threading and Multi-processing taking significantly more time than serial

I'm trying to learn how to do parallel programming in Python. I wrote a simple integer-squaring function and then ran it in serial, multi-thread, and multi-process:
import time
import multiprocessing, threading
import random

def calc_square(numbers):
    sq = 0
    for n in numbers:
        sq = n*n

def splita(list, n):
    a = [[] for i in range(n)]
    counter = 0
    for i in range(0, len(list)):
        a[counter].append(list[i])
        if len(a[counter]) == len(list)/n:
            counter = counter + 1
            continue
    return a

if __name__ == "__main__":
    random.seed(1)
    arr = [random.randint(1, 11) for i in xrange(1000000)]
    print "init completed"

    start_time2 = time.time()
    calc_square(arr)
    end_time2 = time.time()
    print "serial: " + str(end_time2 - start_time2)

    newarr = splita(arr, 8)
    print 'split complete'

    start_time = time.time()
    for i in range(8):
        t1 = threading.Thread(target=calc_square, args=(newarr[i],))
        t1.start()
        t1.join()
    end_time = time.time()
    print "mt: " + str(end_time - start_time)

    start_time = time.time()
    for i in range(8):
        p1 = multiprocessing.Process(target=calc_square, args=(newarr[i],))
        p1.start()
        p1.join()
    end_time = time.time()
    print "mp: " + str(end_time - start_time)
Output:
init completed
serial: 0.0640001296997
split complete
mt: 0.0599999427795
mp: 2.97099995613
However, as you can see, something weird happened and mt is taking the same time as serial and mp is actually taking significantly longer (almost 50 times longer).
What am I doing wrong? Could someone push me in the right direction to learn parallel programming in python?
Edit 01
Looking at the comments, I see that a function that doesn't return anything may seem pointless. The reason I'm even trying this is that I previously tried the following add function:
def addi(numbers):
    sq = 0
    for n in numbers:
        sq = sq + n
    return sq
I tried returning the addition of each part to a serial number adder, so at least I could see some performance improvement over a pure serial implementation. However, I couldn't figure out how to store and use the returned value, and that's the reason I'm trying to figure out something even simpler than that, which is just dividing up the array and running a simple function on it.
Thanks!
I think that multiprocessing takes quite a long time to create and start each process. I have changed the program to make arr 10 times the size, and changed the way that the processes are started, and there is a slight speed-up:
(Also note: Python 3.)
import time
import multiprocessing, threading
from multiprocessing import Queue
import random

def calc_square_q(numbers, q):
    while q.empty():
        pass
    return calc_square(numbers)

if __name__ == "__main__":
    random.seed(1)  # note how big arr is now vvvvvvv
    arr = [random.randint(1, 11) for i in range(10000000)]
    print("init completed")

    # ...
    # other stuff as before
    # ...

    processes = []
    q = Queue()
    for arrs in newarr:
        processes.append(multiprocessing.Process(target=calc_square_q, args=(arrs, q)))

    print('start processes')
    for p in processes:
        p.start()  # even tho' each process is started it waits...

    print('join processes')
    q.put(None)  # ... for q to become not empty.
    start_time = time.time()
    for p in processes:
        p.join()
    end_time = time.time()
    print("mp: " + str(end_time - start_time))
Also notice above how I create and start the processes in two different loops, and then finally join with the processes in a third loop.
Output:
init completed
serial: 0.53214430809021
split complete
start threads
mt: 0.5551605224609375
start processes
join processes
mp: 0.2800724506378174
Another factor of 10 increase in size of arr:
init completed
serial: 5.8455305099487305
split complete
start threads
mt: 5.411392450332642
start processes
join processes
mp: 1.9705185890197754
And yes, I've also tried this in Python 2.7, although threads seemed slower there.
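If you also want the return values back (as in the addi example from the question), a multiprocessing.Pool collects them for you: pool.map returns one result per chunk, which you can then reduce in the parent. A minimal sketch, assuming Python 3 and a simplified version of the splita helper from the question:

import random
import time
from multiprocessing import Pool

def addi(numbers):
    sq = 0
    for n in numbers:
        sq = sq + n
    return sq

def splita(lst, n):
    # split lst into n roughly equal chunks
    size = len(lst) // n
    return [lst[i*size:(i+1)*size] for i in range(n - 1)] + [lst[(n-1)*size:]]

if __name__ == "__main__":
    random.seed(1)
    arr = [random.randint(1, 11) for i in range(10000000)]
    chunks = splita(arr, 8)

    start = time.time()
    with Pool(processes=8) as pool:
        partial_sums = pool.map(addi, chunks)  # one result per chunk
    total = sum(partial_sums)                  # combine in the parent
    print("mp sum:", total, "in", time.time() - start, "s")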

python multiprocessing handling an array of numbers concurrently

I have a list of numbers:
a = [1, 2, 3, 4, 5, ..... 2000]
I have to square each number and update the same array, but instead of writing a loop I want to do it using parallel processing.
So squaring each number in the array becomes a process in itself.
Expected output = [1, 4, 9, 16, 25, ........]
How can I achieve this with the Python multiprocessing library?
I already tried the threading library, but the code is not fast enough; besides, the threading library does not use all the cores.
You can use the Pool class from the multiprocessing module:
from multiprocessing import Pool

def f(x):
    return x*x

if __name__ == '__main__':
    p = Pool(5)
    print(p.map(f, [1, 2, 3]))
    # prints [1, 4, 9]
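Applied to the list from the question, it can also help to pass a chunksize so each worker receives a batch of numbers rather than one task per element; for work this cheap the inter-process overhead otherwise dominates. A small sketch under that assumption:

from multiprocessing import Pool

def square(x):
    return x * x

if __name__ == '__main__':
    a = list(range(1, 2001))
    with Pool(processes=4) as pool:
        # chunksize batches elements so workers are not fed one number at a time
        a = pool.map(square, a, chunksize=250)
    print(a[:5])  # [1, 4, 9, 16, 25]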
For this, numpy would be handy because it squares the whole array in a single vectorized operation. Here is a piece of code that can serve the purpose. In case you want to parallelize it, you can use the Pool approach stated above:
import numpy as np

def Square(data):
    data_np = np.array(data) ** 2
    print(data_np)

Square([1, 2, 3])
You can try ProcessPoolExecutor from the concurrent.futures module. Example code:
from time import time
from concurrent.futures import ProcessPoolExecutor

def gcd(pair):
    a, b = pair
    low = min(a, b)
    for i in range(low, 0, -1):
        if a % i == 0 and b % i == 0:
            return i

numbers = [(1963309, 2265973), (2030677, 3814172),
           (1551645, 2229620), (2039045, 2020802)]

start = time()
results = list(map(gcd, numbers))
end = time()
print('1st Took %.3f seconds' % (end - start))

start = time()
pool = ProcessPoolExecutor(max_workers=2)
results = list(pool.map(gcd, numbers))
end = time()
print('2nd Took %.3f seconds' % (end - start))

Slow parallelized loop in Python

I am trying to parallelize a simple for loop in Python, following instructions that I found on the internet.
Here is my code:
import time
from joblib import Parallel, delayed
import multiprocessing
n = 100000
k = 3
num_cores = multiprocessing.cpu_count()
def processInput(i):
    return i**2

#This should be the parallelized loop
if __name__ == '__main__':
    start = time.time()
    results = Parallel(n_jobs=num_cores)(delayed(processInput)(i) for i in range(n))
    print("Time parallelized loop:")
    print(time.time() - start)
The time required to run this script is the following:
Time parallelized loop:
5.14900016785
The very strange fact is that if I run a sequential loop, the time is much smaller. This is the sequential loop:
import time
from joblib import Parallel, delayed
import multiprocessing
n = 100000
k = 3
num_cores = multiprocessing.cpu_count()
def processInput(i):
    return i**2
#This should be the squential loop
start = time.time()
results = [processInput(i) for i in range(n)]
print("Time Sequential loop:")
print(time.time() - start)
I get this time:
Time Sequential loop:
0.107000112534
At this point it is clear that I am missing something. How is it possible that the sequential loop is so much faster than the parallelized one?
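For what it's worth, with tasks as tiny as i**2 the per-task dispatch overhead of the worker processes usually dominates the actual computation, so one thing worth trying is handing each worker a whole batch of indices instead of a single one. A hedged sketch of that idea (the process_chunk helper below is illustrative, not from the original post):

import time
from joblib import Parallel, delayed
import multiprocessing

n = 100000
num_cores = multiprocessing.cpu_count()

def process_chunk(chunk):
    # square a whole batch of indices in one task to amortize dispatch overhead
    return [i**2 for i in chunk]

if __name__ == '__main__':
    chunks = [range(s, min(s + 10000, n)) for s in range(0, n, 10000)]
    start_t = time.time()
    chunked = Parallel(n_jobs=num_cores)(delayed(process_chunk)(c) for c in chunks)
    results = [x for chunk in chunked for x in chunk]
    print("Time batched parallelized loop:")
    print(time.time() - start_t)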
