Nested dask delayed or futures - python

Looking for best practice for nested parallel jobs. I couldn't nest dask delayed or futures, so I mixed both to get it to work. Is this not recommended? Is there a better way to do this? Example:
import dask
from dask.distributed import Client
import random
import time

client = Client()

def rndSeries(x):
    time.sleep(1)
    return random.sample(range(1, 50), x)

def sqNum(x):
    time.sleep(1)
    return x**2

def subProcess(li):
    results = []
    for i in li:
        r = dask.delayed(sqNum)(i)
        results.append(r)
    return dask.compute(sum(results))[0]

futures = []
for i in range(10):
    x = client.submit(rndSeries, random.randrange(5, 10, 1))
    y = client.submit(subProcess, x)
    futures.append(y)
client.gather(futures)

Consider modifying your script to have a deterministic workflow. If you start with 1 worker, you will see that the process completes in 20 seconds (as expected: 2 tasks of 1 second + 6 tasks of 3 seconds). With 2 workers, the execution time drops to 10 seconds.
import dask
from dask.distributed import Client, LocalCluster
import time
import numpy as np

cluster = LocalCluster(n_workers=1, threads_per_worker=1)
client = Client(cluster)

# if inside jupyter, split the code below into a new cell
# to see accurate timing
%%time
def rndSeries(x):
    time.sleep(1)
    return np.random.rand()

def sqNum(x):
    time.sleep(3)
    return 1

def subProcess(li):
    results = []
    li = [1, 2, 3]
    for i in li:
        r = dask.delayed(sqNum)(i)
        results.append(r)
    return dask.compute(sum(results))[0]

futures = []
for i in range(2):
    x = client.submit(rndSeries, np.random.rand())
    y = client.submit(subProcess, x)
    futures.append(y)
client.gather(futures)
What happens if you have 6 workers? Execution time is now 4 seconds (the lowest possible for this task graph), so it seems that the only drawback of calling dask.compute() inside a future is that it forces the delayed tasks to be computed on a single worker. This is probably OK in many cases; however, if the combined resource requirements of all the delayed tasks exceed the resources of a single worker, the best way to proceed is to submit tasks from tasks: https://distributed.dask.org/en/latest/task-launch.html
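For reference, the task-launch page linked above describes submitting the inner work as futures from inside a running task, which avoids the nested dask.compute() entirely. A minimal sketch of that pattern using worker_client, reusing the function names from the first example (treat it as an illustration rather than a drop-in replacement):

import random
import time
from dask.distributed import Client, worker_client

def rndSeries(x):
    time.sleep(1)
    return random.sample(range(1, 50), x)

def sqNum(x):
    time.sleep(1)
    return x**2

def subProcess(li):
    # Connect back to the scheduler from inside the running task and
    # submit the inner work as futures, so it can spread across workers.
    with worker_client() as client:
        futures = client.map(sqNum, li)
        return sum(client.gather(futures))

if __name__ == '__main__':
    client = Client()
    futures = []
    for _ in range(10):
        x = client.submit(rndSeries, random.randrange(5, 10, 1))
        y = client.submit(subProcess, x)
        futures.append(y)
    print(client.gather(futures))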

Related

How to wait for the worker processes in Python multiprocessing.pool.Pool without closing it?

I'm benchmarking this script on a 6-core CPU with Ubuntu 22.04.1 and Python 3.10.6. It is supposed to show usage of all available CPU cores with the par function vs. a single core with the ser function.
import numpy as np
from multiprocessing import Pool
import timeit as ti

def foo(n):
    return -np.sort(-np.arange(n))[-1]

def par(reps, bigNum, pool):
    for i in range(bigNum, bigNum+reps):
        pool.apply_async(foo, args=(i,))

def ser(reps, bigNum):
    for i in range(bigNum, bigNum+reps):
        foo(i)

if __name__ == '__main__':
    bigNum = 9_000_000
    reps = 6

    fun = f'par(reps, bigNum, pool)'
    t = 1000 * np.array(ti.repeat(stmt=fun, setup='pool=Pool(reps);'+fun, globals=globals(), number=1, repeat=10))
    print(f'{fun}: {np.amin(t):6.3f}ms {np.median(t):6.3f}ms')

    fun = f'ser(reps, bigNum)'
    t = 1000 * np.array(ti.repeat(stmt=fun, setup=fun, globals=globals(), number=1, repeat=10))
    print(f'{fun}: {np.amin(t):6.3f}ms {np.median(t):6.3f}ms')
Right now, the par function only shows the time needed to spin up the worker processes. What do I need to change in par to make it wait for all worker processes to complete before returning? Note that I would like to reuse the process pool between calls.
You need to get the result from apply_async in order to wait for it.
def par(reps, bigNum, pool):
    jobs = []
    for i in range(bigNum, bigNum+reps):
        jobs.append(pool.apply_async(foo, args=(i,)))
    for job in jobs:
        job.get()
For long loops you should use map, imap, or imap_unordered instead of apply_async, as they have less overhead and let you control the chunksize for faster serialization of small objects. You can also pass generators to them to save memory or to allow infinite generators (with imap); a sketch of the generator/imap_unordered variant follows the map example below.
def par(reps, bigNum, pool):
    pool.map(foo, range(bigNum, bigNum+reps), chunksize=1)
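If the arguments are produced lazily, a sketch of the imap_unordered variant (same foo and pool as above) could look like this; iterating over the result is what blocks until all tasks have finished:

def par(reps, bigNum, pool):
    # Lazily generated arguments; imap_unordered pulls them as workers free up
    # and yields results in completion order.
    args = (i for i in range(bigNum, bigNum + reps))
    for _ in pool.imap_unordered(foo, args, chunksize=1):
        pass  # draining the iterator waits for all tasks to complete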
Note: PEP 8 indentation in Python is 4 spaces, not 2.

parallel processing for apply function

I have a dataframe of 100,000 records, so I tried parallel processing using the joblib library, which works fine with my code below. My question is: can I do the same with the 'apply' and 'lambda' style, which is very close to my original code and needs minimal changes, instead of using the for loop as in my code? Please help.
Original Code - Without parallel processing:
df['b1'] = df.text1.apply(lambda x: removeNumbers(x))
With parallel processing:
To apply joblib's parallel processing I converted it to the loop below:
df['b1'] = Parallel(n_jobs = -1)(delayed(removeNumbers)(x) for x in df.text1)
I have the following code which I use when I have a large dataframe and want to use parallel computing:
import numpy as np
import pandas as pd
import time
from multiprocessing import Pool, cpu_count
from functools import partial

# Wrapper to time functions (not needed for parallel computing but to show that it works...)
def time_function(func):
    def decorated_func(*args, **kwargs):
        start = time.perf_counter_ns()
        ret = func(*args, **kwargs)
        stop = time.perf_counter_ns()
        temp = []
        temp += [type(a) for a in args]
        f = lambda x: f"{x}={type(kwargs[x])}"
        temp += list(map(f, kwargs))
        print(f"Function {func.__name__}{*temp,}: time elapsed: {(stop - start)*1e-6:.3f} [ms]")
        return ret
    return decorated_func

# This function splits the data and calls the functions.
def parallelize(data, func, num_of_processes=cpu_count()):
    data_split = np.array_split(data, num_of_processes)
    p = Pool(num_of_processes)
    data = pd.concat(p.map(func, data_split))
    p.close()
    p.join()
    return data

# This function is only used for pandas (otherwise the parallelize function was enough)
def run_on_subset(func, data_subset):
    return data_subset.apply(func, axis=1)

# This function is maybe redundant, but it keeps the code readable.
def parallelize_on_rows(data, func, num_of_processes=8):
    return parallelize(data, partial(run_on_subset, func), num_of_processes)

def sum_two_columns(row):
    time.sleep(0.1)  # Make it a time consuming function
    return row[0] + row[1]

@time_function
def oridnary_apply(df):
    return df.apply(sum_two_columns, axis=1)

@time_function
def parallel_apply(df):
    return parallelize_on_rows(df, sum_two_columns)

if __name__ == '__main__':
    array = np.ones((100, 3))
    df = pd.DataFrame(array)
    print(f"cpu_count: {cpu_count()}")

    oridnary_apply(df)
    parallel_apply(df)
    print('done')
>>> cpu_count: 12
>>> Function oridnary_apply(<class 'pandas.core.frame.DataFrame'>,): time elapsed: 10860.275 [ms]
>>> Function parallel_apply(<class 'pandas.core.frame.DataFrame'>,): time elapsed: 2170.105 [ms]
>>> done
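For completeness, the same split-and-apply idea can also be phrased with joblib, which stays closer to the apply/lambda style the question asks about. A sketch, under the assumption that the question's df.text1 and removeNumbers exist:

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def apply_chunk(series_chunk):
    # Keep the familiar apply/lambda style, but on one chunk of the Series.
    return series_chunk.apply(lambda x: removeNumbers(x))

chunks = np.array_split(df.text1, 8)  # roughly one chunk per core
parts = Parallel(n_jobs=-1)(delayed(apply_chunk)(c) for c in chunks)
df['b1'] = pd.concat(parts)  # concatenated parts keep the original index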
EDIT:
When many of the values in your rows are equal, it is also possible to cache your function. If it is a complex function, meaning the execution time is relatively long, this is another way to speed up the apply on your DataFrame.
https://docs.python.org/3/library/functools.html#functools.lru_cache
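A small sketch of that caching idea, using a hypothetical expensive() helper and hashable scalar arguments (pandas rows themselves are not hashable):

import time
from functools import lru_cache

@lru_cache(maxsize=None)
def expensive(a, b):
    time.sleep(0.1)  # stands in for a slow, pure computation
    return a + b

def sum_two_columns(row):
    # Repeated (row[0], row[1]) pairs hit the cache instead of recomputing.
    return expensive(row[0], row[1])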

Python pool.map function completes but leaves zombies

I've been having an issue where pool.map leaves processes behind even after pool.terminate is called. I've looked for solutions, but they all seem to have some other issue, like recursively calling the map function or another process that interferes with the multiprocessing.
My code imports 2 NetCDF files and processes the data in them using different calculations. These take up a lot of time (several 6400x6400 arrays), so I tried to multiprocess my code. The multiprocessing works, and the first time I run my code it takes 2.5 minutes (down from 8), but every time my code finishes running, the memory usage by Spyder never goes back down and it leaves extra Python processes in the Windows task manager. My code looks like this:
import numpy as np
import netCDF4
import math
from math import sin, cos
import logging
from multiprocessing.pool import Pool
import time

start = time.time()
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%H:%M:%S")
logging.info("Here we go!")

path = "DATAPATH"
geopath = "DATAPATH"

f = netCDF4.Dataset(path)
f.set_auto_maskandscale(False)
f2 = netCDF4.Dataset(geopath)

i5lut = f.groups['observation_data'].variables['I05_brightness_temperature_lut'][:]
i4lut = f.groups['observation_data'].variables['I05_brightness_temperature_lut'][:]
I5 = f.groups['observation_data'].variables['I05'][:]
I4 = f.groups['observation_data'].variables['I04'][:]
I5 = i5lut[I5]
I4 = i4lut[I4]
I4Quality = f.groups['observation_data'].variables['I04_quality_flags'][:]
I5Quality = f.groups['observation_data'].variables['I05_quality_flags'][:]
I3 = f.groups['observation_data'].variables['I03']
I2 = f.groups['observation_data'].variables['I02']
I1 = f.groups['observation_data'].variables['I01']
I1.set_auto_scale(True)
I2.set_auto_scale(True)
I3.set_auto_scale(True)
I1 = I1[:]
I2 = I2[:]
I3 = I3[:]

lats = f2.groups['geolocation_data'].variables['latitude'][:]
lons = f2.groups['geolocation_data'].variables['longitude'][:]
solarZen = f2.groups['geolocation_data'].variables['solar_zenith'][:]
sensorZen = solarZen = f2.groups['geolocation_data'].variables['sensor_zenith'][:]
solarAz = f2.groups['geolocation_data'].variables['solar_azimuth'][:]
sensorAz = solarZen = f2.groups['geolocation_data'].variables['sensor_azimuth'][:]

def kernMe(i, j, band):
    if i < 250 or j < 250:
        return -1
    else:
        return np.mean(band[i-250:i+250:1, j-250:j+250:1])

def thread_me(arr):
    start1 = arr[0]
    end1 = arr[1]
    start2 = arr[2]
    end2 = arr[3]
    logging.info("Im starting at: %d to %d, %d to %d" % (start1, end1, start2, end2))
    points = []
    avg = np.mean(I4)
    for i in range(start1, end1):
        for j in range(start2, end2):
            if solarZen[i, j] >= 90:
                if not (I5[i, j] < 265 and I4[i, j] < 295):
                    if I4[i, j] > 320 and I4Quality[i, j] == 0:
                        points.append([lons[i, j], lats[i, j], 1])
                    elif I4[i, j] > 300 and I5[i, j] - I4[i, j] > 10:
                        points.append([lons[i, j], lats[i, j], 2])
                    elif I4[i, j] == 367 and I4Quality == 9:
                        points.append([lons[i, j], lats[i, j, 3]])
            else:
                if not ((I1[i, j] > I2[i, j] > I3[i, j]) or (I5[i, j] < 265 or (I1[i, j] + I2[i, j] > 0.9 and I5[i, j] < 295) or
                        (I1[i, j] + I2[i, j] > 0.7 and I5[i, j] < 285))):
                    if not (I1[i, j] + I2[i, j] > 0.6 and I5[i, j] < 285 and I3[i, j] > 0.3 and I3[i, j] > I2[i, j] and I2[i, j] > 0.25 and I4[i, j] <= 335):
                        thetaG = (cos(sensorZen[i, j]*(math.pi/180))*cos(solarZen[i, j]*(math.pi/180))) - (sin(sensorZen[i, j]*(math.pi/180))*sin(solarZen[i, j]*(math.pi/180))*cos(sensorAz[i, j]*(math.pi/180)))
                        thetaG = math.acos(thetaG)*(180/math.pi)
                        if not ((thetaG < 15 and I1[i, j] + I2[i, j] > 0.35) or (thetaG < 25 and I1[i, j] + I2[i, j] > 0.4)):
                            if math.floor(I4[i, j]) == 367 and I4Quality[i, j] == 9 and I5 > 290 and I5Quality[i, j] == 0 and (I1[i, j] + I2[i, j]) > 0.7:
                                points.append([lons[i, j], lats[i, j, 4]])
                            elif I4[i, j] - I5[i, j] > 25 or True:
                                kern = kernMe(i, j, I4)
                                if kern != -1 or True:
                                    BT4M = max(325, kern)
                                    kern = min(330, BT4M)
                                    if I4[i, j] > kern and I4[i, j] > avg:
                                        points.append([lons[i, j], lats[i, j], 5])
    return points

if __name__ == '__main__':
    # Separate the arrays into 1616*1600 chunks for multi processing
    # TODO: make this automatic, not hardcoded
    arg = [[0, 1616, 0, 1600], [0, 1616, 1600, 3200], [0, 1616, 3200, 4800], [0, 1616, 4800, 6400],
           [1616, 3232, 0, 1600], [1616, 3232, 1600, 3200], [1616, 3232, 3200, 4800], [1616, 3232, 4800, 6400],
           [3232, 4848, 0, 1600], [3232, 4848, 1600, 3200], [3232, 4848, 3200, 4800], [3232, 4848, 4800, 6400],
           [4848, 6464, 0, 1600], [4848, 6464, 1600, 3200], [4848, 6464, 3200, 4800], [4848, 6464, 4800, 6400]]
    print(arg)

    p = Pool(processes=4)
    output = p.map(thread_me, arg)
    p.close()
    p.join()

    print(output)
    f.close()
    f2.close()
    logging.info("Aaaand we're here!")
    print(str((time.time() - start)/60))
    p.terminate()
I use both p.close and p.terminate because I thought it would help (it doesn't). All of my code runs and produces the expected output, but I have to manually end the lingering processes using the task manager. Any ideas as to what's causing this?
I think I put all the relevant information here; if you need more I'll edit in the requests.
Thanks in advance.

How to parallelize a nested for loop in python?

OK, here is my problem: I have a nested for loop in my program which runs on a single core. Since the program spends over 99% of its run time in this nested for loop, I would like to parallelize it. Right now I have to wait 9 days for the computation to finish. I tried to implement a parallel for loop using the multiprocessing library, but I only found very basic examples and could not transfer them to my problem. Here are the nested loops with random data:
import numpy as np

dist_n = 100
nrm = np.linspace(1, 10, dist_n)
data_Y = 11000
data_I = 90000
I = np.random.randn(data_I, 1000)
Y = np.random.randn(data_Y, 1000)
dist = np.zeros((data_I, dist_n))

for t in range(data_Y):
    for i in range(data_I):
        d = np.abs(I[i] - Y[t])
        for p in range(dist_n):
            dist[i, p] = np.sum(d**nrm[p])/nrm[p]

print(dist)
Please give me some advice on how to make it parallel.
There's a small overhead to starting a process (50ms+ depending on data size), so it's generally best to MP the largest block of code possible. From your comment it sounds like each loop over t is independent, so we should be free to parallelize this.
When Python creates a new process you get a copy of the main process, so all your global data is available, but when each process writes the data, it writes to its own local copy. This means dist[i,p] won't be available to the main process unless you explicitly pass it back with a return (which has some overhead; a return-based sketch follows the timing example below). In your situation, if each process writes dist[i,p] to a file then you should be fine; just don't try to write to the same file unless you implement some type of mutex access control.
#!/usr/bin/python
import time
import multiprocessing as mp
import numpy as np

data_Y = 11  # 11000
data_I = 90  # 90000
dist_n = 100
nrm = np.linspace(1, 10, dist_n)
I = np.random.randn(data_I, 1000)
Y = np.random.randn(data_Y, 1000)
dist = np.zeros((data_I, dist_n))

def worker(t):
    st = time.time()
    for i in range(data_I):
        d = np.abs(I[i] - Y[t])
        for p in range(dist_n):
            dist[i, p] = np.sum(d**nrm[p])/nrm[p]
    # Here - each worker opens a different file and writes to it
    print('Worker time %4.3f mS' % (1000.*(time.time()-st)))

if 1:  # single process
    st = time.time()
    for x in map(worker, range(data_Y)):
        pass
    print('Single-process total time is %4.3f seconds' % (time.time()-st))
    print()

if 1:  # multi-process
    pool = mp.Pool(28)  # try 2X num procs and inc/dec until cpu maxed
    st = time.time()
    for x in pool.imap_unordered(worker, range(data_Y)):
        pass
    print('Multiprocess total time is %4.3f seconds' % (time.time()-st))
    print()
If you increase data_Y/data_I back to their original sizes, the speed-up should increase up to the theoretical limit.
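If the per-t results do need to come back to the parent process rather than going to per-worker files, one variant (a sketch meant to drop into the script above, reusing its imports and globals) is to return the computed block and collect it in the parent; note that returning arrays adds pickling overhead per task:

def worker(t):
    # Compute this t's (data_I, dist_n) block locally and return it,
    # instead of writing to the global dist (which the parent never sees).
    block = np.zeros((data_I, dist_n))
    for i in range(data_I):
        d = np.abs(I[i] - Y[t])
        for p in range(dist_n):
            block[i, p] = np.sum(d**nrm[p]) / nrm[p]
    return t, block

if __name__ == '__main__':
    with mp.Pool() as pool:
        results = {t: block for t, block in pool.imap_unordered(worker, range(data_Y))}
    # results[t] now holds, in the parent, the distances computed for that t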

Python solve LPs in parallel

I have a list of LPs which I want to solve in parallel.
So far I have tried both multiprocessing and joblib, but both use only 1 CPU (out of 8).
My code
import subprocess
from multiprocessing import Pool, cpu_count
from scipy.optimize import linprog
import numpy as np
from joblib import Parallel, delayed

def is_in_convex_hull(arg):
    A, v = arg
    res = linprog(np.zeros(A.shape[1]), A_eq=A, b_eq=v)
    return res['success']

def convex_hull_LP(A):
    pool = Pool(processes=cpu_count())
    res = pool.map(is_in_convex_hull, [(np.delete(A, i, axis=1), A[:, i]) for i in range(A.shape[1])])
    pool.close()
    pool.join()
    return [i for i in range(A.shape[1]) if not res[i]]
Now in IPython I run
A = np.random.randint(0,60,size = (40,300))
%time l1 = convex_hull_LP(A)
%time l2 = Parallel(n_jobs=8)(delayed(is_in_convex_hull)((np.delete(A,i,axis=1),A[:,i])) for i in range(A.shape[1]))
which both result in about 7 seconds, but using only a single CPU, although 8 different process-IDs are shown.
Other Threads
With the answer from "Python multiprocessing.Pool() doesn't use 100% of each CPU" I got 100% on all cores, but I think an LP is complicated enough to be the bottleneck.
I couldn't make sense of "Multiprocess in python uses only one process".
My Questions
How can I split the jobs over all available CPUs?
Or is it even possible to run this on the GPU?
