parallelizing for loop in class with joblib: pickable error

parallelizing for loop in class with joblib: pickable error - python

I am trying to parallelize the for loop inside the active_again function, but I get this error message:
BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
When I use the for loop which is commented out underneath the parallel line it is working fine. But at some point the code will have to work with larger numbers of C_Cells and I would need it to run faster.
# -*- coding: utf-8 -*-
import numpy as np
from scipy.spatial.distance import cdist
from joblib import Parallel, delayed #parallelise loops
import multiprocessing as mp
num_cores = mp.cpu_count()
loc = np.array([(-1,-1,0), (0,1,0),
(1,1,0), (-1,0,0), (1,0,0), (0,-1,0), (1,-1,0),(-1,1,0),
(-1,1,1), (0,1,1), (1,1,1), (-1,0,1), (0,0,1), (1,0,1),
(-1,-1,1), (0,-1,1), (1,-1,1), (-1,1,-1), (0,1,-1), (1,1,-1),
(-1,0,-1), (0,0,-1), (1,0,-1), (-1,-1,-1), (0,-1,-1), (1,-1,-1)])
class C_Cell(object):
    """A single cell that remembers the position it was created with."""

    def __init__(self, position0):
        # Stored as-is; C_Cells passes one row of its point array here.
        self.c_position0 = position0


class C_Cells(object):
    """Container for two pools of cells and their positions.

    ``c_cells`` / ``c_position`` hold the cells created in ``__init__``.
    ``go_inactive`` moves cells into ``q_cells`` / ``q_position`` and
    ``active_again`` moves them back.
    """

    def __init__(self, pts, c_ncell):
        """Create ``c_ncell`` cells from the first ``c_ncell`` rows of ``pts``."""
        # array of cell objects according to prior sample
        self.c_cells = np.array([C_Cell(pts[i]) for i in range(c_ncell)])
        self.c_position = np.array([c_cell.c_position0 for c_cell in self.c_cells])
        self.q_cells = np.empty((0, 1))
        self.q_position = np.empty((0, 3), dtype=int)  # np.array([(0,0,0)])

    def go_inactive(self):
        """Move five cells from the ``c_*`` arrays to the ``q_*`` arrays."""
        # Rows are deleted while ``i`` keeps increasing, so the cells that move
        # are the ones at original indices 0, 2, 4, 6 and 8.
        for i in range(5):
            self.q_position = np.vstack((self.q_position, self.c_position[i]))
            self.q_cells = np.append(self.q_cells, self.c_cells[i])
            # mother cell leaves division pool
            self.c_position = np.delete(self.c_position, (i), axis=0)
            self.c_cells = np.delete(self.c_cells, (i), axis=0)

    def active_again(self, e_position, s_position):
        """Move every ``q_*`` cell that still has a free neighbour back to ``c_*``.

        For each cell in the ``q_*`` arrays (visited from last to first) the
        neighbour coordinates ``position + loc`` are computed, and every
        neighbour that coincides with a row of ``c_position``, ``e_position``,
        ``s_position`` or ``q_position`` is discarded. If at least one
        neighbour is left, the cell is moved back.

        ``e_position`` and ``s_position`` are passed to ``cdist`` and therefore
        have to be 2-D coordinate arrays.
        """
        num_cores = mp.cpu_count()
        qs = np.array(range(self.q_cells.shape[0]))

        def inner_active_again(s):
            # Candidate neighbour positions of the s-th cell counted from the end.
            p = self.q_position[qs[::-1][s]] + loc
            # A distance of exactly 0 means the candidate position is occupied.
            c = cdist(p, self.c_position) == 0
            p = p[~c.any(axis=1)]
            d = cdist(p, e_position) == 0
            p = p[~d.any(axis=1)]
            f = cdist(p, s_position) == 0
            p = p[~f.any(axis=1)]
            g = cdist(p, self.q_position) == 0
            p = p[~g.any(axis=1)]
            if p.shape[0] > 0:
                # append the cell to the c_* arrays ...
                self.c_position = np.vstack((self.c_position, self.q_position[qs[::-1][s]]))
                self.c_cells = np.append(self.c_cells, self.q_cells[qs[::-1][s]])
                # ... and remove it from the q_* arrays
                self.q_position = np.delete(self.q_position, (qs[::-1][s]), axis=0)
                self.q_cells = np.delete(self.q_cells, (qs[::-1][s]), axis=0)

        # NOTE(review): inner_active_again mutates ``self``. With a process-based
        # joblib backend each worker would operate on its own copy of the object,
        # so the parent's arrays would presumably stay unchanged even if the task
        # could be (un)pickled -- confirm before relying on Parallel here.
        Parallel(n_jobs=num_cores)(
            delayed(inner_active_again)(i) for i in range(self.q_cells.shape[0])
        )
        # for i in range(self.q_cells.shape[0]):
        #     inner_active_again(i)
####################
# Build three shifted copies of the neighbour offsets as the initial point set.
a = loc + 1
c = loc - 1
b = np.vstack((a, loc, c))
e_position = np.array([[1, 2, 3]])
s_position = np.array([[3, 3, 3]])
s_cells = 1
c_ncell = 52
l = C_Cells(b, c_ncell)
t = 0
# Each step moves five cells out of the c_* pool and then tries to move
# q_* cells back in.
while t < 100:
    t += 1
    print('t=', t)
    l.go_inactive()
    l.active_again(e_position, s_position)
Any help regarding this issue would be highly appreciated.

Related

Python multiprocessing producing unstable results

Can anyone help me understand why this simple example of trying to speed up a for loop using python's multiprocessing module produces unstable results? I use a Manager.List to store the values from the child processes.
Clearly I'm doing at least one thing wrong. What would be the correct way to do this?
import numpy as np
import multiprocessing
from matplotlib import pyplot as plt
from functools import partial
from multiprocessing import Manager
def run_parallel(x_val, result):
    """Compute arctan(x_val) and append it to ``result``."""
    val = np.arctan(x_val)
    # Appending from several workers stores values in completion order,
    # not in the order of the inputs.
    result.append(val)


def my_func(x_array, parallel=False):
    """Return arctan of every element of ``x_array``.

    With ``parallel=False`` the values are computed in a plain loop. With
    ``parallel=True`` four worker processes append to a shared
    ``Manager().list()``, so the order of the returned values depends on
    which worker finishes first.
    """
    if not parallel:
        result = []
        for k in x_array:
            result.append(np.arctan(k))
        return result
    else:
        manager = Manager()
        m_result = manager.list()
        pool = multiprocessing.Pool(4)
        pool.map(partial(run_parallel, result=m_result), x_array)
        return list(m_result)
# 50 evenly spaced sample points between 0.1 and 1.
test_x = np.linspace(0.1,1,50)
# Compute arctan once with the plain loop and once with the worker pool.
serial = my_func(test_x,parallel=False)
parallel = my_func(test_x,parallel=True)
# Draw both results over the same x values so they can be compared.
plt.figure()
plt.plot(test_x, serial, label='serial')
plt.plot(test_x,parallel, label='parallel')
plt.legend(loc='best')
plt.show()
The output I'm getting looks like this
and it looks different every time this runs.

I added some print functions and it turned out that the order of elements from x_array is arbitrary... That's why it looks so weird. I think you should keep argument and value of arctan pairs and then order it by argument value
EDIT
I read more and it turned out that map returns values in order... This works as you wanted:
import numpy as np
import multiprocessing
from matplotlib import pyplot as plt
from functools import partial
from multiprocessing import Manager
def run_parallel(x_val, result=None):
    """Return arctan(x_val).

    ``result`` is ignored; it is kept (now optional) only so that existing
    calls passing a second argument keep working.
    """
    return np.arctan(x_val)


def my_func(x_array, parallel=False):
    """Return a list with arctan of every element of ``x_array``, in input order.

    With ``parallel=True`` the work is spread over four worker processes.
    ``Pool.map`` returns its results in the order of the inputs, so no shared
    list is needed to collect them.
    """
    if not parallel:
        return [np.arctan(k) for k in x_array]
    # The context manager shuts the worker processes down once map() is done.
    with multiprocessing.Pool(4) as pool:
        return list(pool.map(run_parallel, x_array))
# Guard the entry point: my_func(parallel=True) starts worker processes, and
# with the "spawn" start method each worker re-imports this module.
if __name__ == '__main__':
    test_x = np.linspace(0.1, 1, 50)
    parallel = my_func(test_x, parallel=True)
    plt.figure()
    plt.plot(test_x, parallel, label='parallel')
    plt.legend(loc='best')
    plt.show()

Advancing tensorflow dataset iterator in python multiprocessing Queue

Is there any way to move the iterator in this example?
import tensorflow as tf
import numpy as np
from multiprocessing import Process, Queue
def store(batch, queue):
    """Put ``batch`` on ``queue`` forever."""
    # The same ``batch`` object is enqueued on every iteration; nothing in
    # this loop fetches a new one.
    while True:
        queue.put(batch)
if __name__ == '__main__':
    pqueue = Queue()
    a1 = np.arange(1000)
    # Endless dataset of single-element batches taken from a1.
    m = tf.data.Dataset.from_tensor_slices(a1).repeat().batch(1)
    iter_m = m.make_one_shot_iterator()
    m_init_ops = iter_m.make_initializer(m)
    next_m = iter_m.get_next()
    with tf.Session() as sess:
        # next_m is evaluated exactly once here; the child process only
        # receives this one result.
        batch = sess.run(next_m)
        pp_process = Process(target=store, args=(batch, pqueue,))
        pp_process.daemon = True
        pp_process.start()
        for i in range(10):
            print(pqueue.get())
My idea is to store processed data in the queue that can be accessed by tensorflow for training, unfortunately I could not advance the iterator. Any suggestions will be greatly appreciated.
The current output is
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]

Tensorflow multithreading
The iterator is not advancing since you are technically only executing the get_next operation once: sess.run(next_m). If you were only using tensorflow multithreading, you could have obtained the desired results by simply moving it into the store function:
def store(sess, next_m, queue):
    """Evaluate ``next_m`` in ``sess`` forever and put each result on ``queue``."""
    while True:
        # Running the op inside the loop is what produces a new value each time.
        queue.put(sess.run(next_m))
# batch = sess.run(next_m) <- Remove
pp_process = Thread(target=store,args=(sess, next_m, pqueue,)) # <- Thread with correct args passed
Tensorflow multiprocessing
However, for multiprocessing, you should also ensure that you never instantiate (fork) a new process after already having created a session since the session object is not serializable.
In your case, you can simply create a new session in the store function and start the main session after forking:
from multiprocessing import Process, Queue
import numpy as np
import tensorflow as tf
def store(next_m, queue):
    """Open a session of its own and keep putting evaluated batches on ``queue``."""
    with tf.Session() as sess:
        while True:
            queue.put(sess.run(next_m))


if __name__ == '__main__':
    ...
    pp_process = Process(target=store, args=(next_m, pqueue,))
    pp_process.daemon = True
    pp_process.start()  # <- Fork before starting this session!
    with tf.Session() as sess:
        for i in range(10):
            print(pqueue.get())

Python 3.7 : multiprocessing a for loop with shared variables

first a bit of context :
I'm trying to write down a python script to convert Image in greyscale (.tif) to a .jpeg with the so called ''jet'' colormap. I managed to do it with a for loop but it's a bit long for one image (millions of pixels to treat !), so I would like to use multiprocessing.
My problem here is that, to convert each grey pixel into a coloured one, I have to use two variables: the minimum light intensity ''min_img'' and a value ''dx_cm'' used to go from the initial grey scale to a 256-level scale, corresponding to the jet colormap.
So to pass the information of ''min_img'' and ''dx_cm'' to the processes I try to use multiprocessing.Value() but in return I get the error :
RuntimeError: Synchronized objects should only be shared between processes through inheritance
I tried many different things from different sources and no matter the version of my code I'm struggling with that error. So I'm sorry if my code isn't clean, I would be very grateful if someone could help me with that.
My non-working code :
import multiprocessing
from PIL import Image
from matplotlib import cm
def fun(gr_list, dx, minp):
    """Convert grey values to a flat list of jet colour components.

    ``dx`` and ``minp`` are read through ``.value``. Every grey value is
    shifted by the minimum, divided by the step and looked up in ``cm.jet``;
    the last component of each colour is dropped.
    """
    dx_cmp = dx.value
    min_imgp = minp.value
    rgb_res = list()
    for i in range(len(gr_list)):
        rgb_res.extend(cm.jet(round(((gr_list[i] - min_imgp) / dx_cmp) - 1))[0:-1])
    return rgb_res


if __name__ == '__main__':
    RGB_list = list()
    n = multiprocessing.cpu_count()
    img = Image.open(r'some_path_to_a.tif')
    Img_grey = list(img.getdata())
    dx_cm = multiprocessing.Value('d', (max(Img_grey) - min(Img_grey)) / 256)
    min_img = multiprocessing.Value('d', min(Img_grey))
    with multiprocessing.Pool(n) as p:
        # NOTE(review): map() iterates over the 3-tuple, so ``fun`` is called
        # once per tuple element with a single argument, and the Value
        # objects are sent to the workers as task arguments.
        RGB_list = list(p.map(fun, (Img_grey, dx_cm, min_img)))
    res = Image.frombytes("RGB", (img.size[0], img.size[1]), bytes([int(0.5 + 255*i) for i in RGB_list]))
    res.save('rgb_file.jpg')
PS: Here is an example of the initial for loop that I would like to parallelize:
from PIL import Image
from matplotlib import cm
if __name__ == '__main__':
    img = Image.open(r'some_path_to_a.tif')
    Img_grey = list(img.getdata())
    min_img = min(Img_grey)
    # Width of one colour step when the grey range is split into 256 levels.
    dx_cm = (max(Img_grey) - min_img) / 256
    Img_rgb = list()
    for grey in Img_grey:
        # Look up the jet colour of this pixel and drop its last component.
        Img_rgb.extend(cm.jet(round(((grey - min_img) / dx_cm) - 1))[0:-1])
    res = Image.frombytes("RGB", (img.size[0], img.size[1]), bytes([int(0.5 + 255*i) for i in Img_rgb]))
    res.save('rgb_file.jpg')

Your fun method is looping over some list, but in this case it will receive a "part", an item from your list, so it should return only the result of its processing.
I have changed the working code to run with multiprocessing.
As the fun method returns a list, p.map returns a list of lists (one result per item), which needs to be flattened; previously this was done with the list's extend method.
I tried both process-pool and thread-pool multiprocessing; in my scenario there weren't any performance gains.
Process multiprocessing:
from PIL import Image
from matplotlib import cm
import multiprocessing
def fun(d):
    """Colour one pixel.

    ``d`` is a ``(grey value, step, minimum)`` tuple; the jet colour of the
    scaled value is returned without its last component.
    """
    part, dx_cm, min_img = d
    return cm.jet(round(((part - min_img) / dx_cm) - 1))[0:-1]


if __name__ == '__main__':
    img = Image.open(r'a.tif')
    Img_grey = list(img.getdata())

    def Gen(img_data):
        """Yield ``(pixel, dx_cm, min_img)`` so every task carries what it needs."""
        min_img = min(img_data)
        dx_cm = (max(img_data) - min_img) / 256
        for part in img_data:
            yield part, dx_cm, min_img

    n = multiprocessing.cpu_count()
    with multiprocessing.Pool(n) as p:
        # p.map returns one colour per pixel; flatten them into one sequence.
        Img_rgb = [item for sublist in p.map(fun, Gen(Img_grey)) for item in sublist]
    res = Image.frombytes("RGB", (img.size[0], img.size[1]), bytes([int(0.5 + 255*i) for i in Img_rgb]))
    res.save('b.jpg')
Thread multiprocessing:
from PIL import Image
from matplotlib import cm
import multiprocessing
from multiprocessing.pool import ThreadPool
if __name__ == '__main__':
    img = Image.open(r'a.tif')
    Img_grey = list(img.getdata())
    min_img = min(Img_grey)
    dx_cm = (max(Img_grey) - min_img) / 256

    def fun(part):
        """Return the jet colour of one grey value without its last component."""
        # Threads share memory, so min_img and dx_cm are read from the
        # enclosing scope instead of being passed with every task.
        return cm.jet(round(((part - min_img) / dx_cm) - 1))[0:-1]

    n = multiprocessing.cpu_count()
    with ThreadPool(n) as p:
        Img_rgb = [item for sublist in p.map(fun, Img_grey) for item in sublist]
    res = Image.frombytes("RGB", (img.size[0], img.size[1]), bytes([int(0.5 + 255*i) for i in Img_rgb]))
    res.save('b.jpg')

So it seems that the computational burden isn't big enough for multiprocessing to be helpful.
Nevertheless, for those coming across this topic interested in the image processing part of my question, I found another much quicker way (15 to 20 x than previous method) to do the same thing without a for loop :
from matplotlib import cm
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import numpy as np
from PIL import Image
# Look up the 'jet' colormap object once.
cm_jet = cm.get_cmap('jet')
img_src = Image.open(r'path to your grey image')
# NOTE(review): overrides the image's mode attribute directly; presumably to
# have the pixel data treated as 32-bit integers -- confirm this is intended.
img_src.mode='I'
Img_grey = list(img_src.getdata())
max_img = max(Img_grey)
min_img = min(Img_grey)
# Scale the whole image to [0, 1], apply the colormap to the array in one call,
# then scale by 255 and cast to uint8.
rgb_array=np.uint8(cm_jet(((np.array(img_src)-min_img)/(max_img-min_img)))*255)
ax = plt.subplot(111)
im = ax.imshow(rgb_array, cmap='jet')
# Put a colorbar axis to the right of the image.
divider = make_axes_locatable(ax)
cax_plot = divider.append_axes("right", size="5%", pad=0.05)
# Ticks at 0, 1/4, 1/2, 3/4 and the end of the 0..255 range.
cbar=plt.colorbar(im, cax=cax_plot, ticks=[0,63.75,127.5,191.25,255])
# Intensity per colour level, used to label the ticks in original intensity units.
dx_plot=(max_img-min_img)/255
cbar.ax.set_yticklabels([str(min_img),str(round(min_img+63.75*dx_plot)),str(round(min_img+127.5*dx_plot)),str(round(min_img+191.25*dx_plot)), str(max_img)])
# Hide both image axes.
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)
plt.savefig('test_jet.jpg', quality=95, dpi=1000)

Python OpenCV speedup for multiprocessing

I am trying to run my image processing algorithm on a live feed from the webcam.
I want this to run in a parallel process from the multiprocessing module, how can i implement this?
This is my current code without parallel coding:
from cv2 import VideoCapture , imshow , waitKey ,imwrite
import numpy as np
from time import time
def greenify(x):
    """Score one pixel (placeholder)."""
    # ``some_value`` is not defined in this snippet; it stands in for the
    # real per-pixel computation.
    return some_value
skip = 4
video = VideoCapture(0)
# Capture properties 3 and 4 are set to the down-scaled width and height.
video.set(3, 640 / skip)
video.set(4, 480 / skip)
total = 0
top_N = 100
while True:
    image = video.read()[1]
    # Key code 27 ends the loop.
    if waitKey(1) == 27:
        break
    # Only ``np`` is imported, so the NumPy functions are called through it.
    arr = np.array([list(map(greenify, j)) for j in image])
    # Indices of the top_N highest-scoring pixels.
    result = np.unravel_index(np.argpartition(arr, arr.size - top_N, axis=None)[-top_N:], arr.shape)
    centre = skip * np.median(result[0]), skip * np.median(result[1])
    imshow('Feed', image)
# NOTE(review): ``total`` is never updated in this snippet; presumably the
# timing code was removed -- confirm whether this belongs inside the loop.
print('Time taken:', total)
video.release()

I have modified your code: basically, you turn it into a function and then run that function in a parallel process. Call bob.start() wherever you want in the code, and within a few milliseconds the parallel code will run.
import numpy as np
from cv2 import VideoCapture
from multiprocessing import Process, Manager
import multiprocessing as mp
def getcors():
    """Read webcam frames in a loop and locate the centre of the top-scoring pixels.

    Every frame is scored pixel by pixel with ``greenify``; the ``top_N``
    highest scores are located and the median of their coordinates, scaled
    by ``skip``, is computed. The loop ends when the camera stops delivering
    frames, and the capture device is always released.
    """
    skip = 4
    top_N = 100
    video = VideoCapture(0)
    try:
        video.set(3, 640 / skip)
        video.set(4, 480 / skip)
        while True:
            ok, frame = video.read()
            if not ok:
                # No frame was delivered: stop instead of failing on it below.
                break
            arr = np.array([list(map(greenify, j)) for j in frame])
            result = np.unravel_index(
                np.argpartition(arr, arr.size - top_N, axis=None)[-top_N:], arr.shape)
            # NOTE(review): ``centre`` is computed but not returned or shared
            # with the parent process -- confirm how the caller should get it.
            centre = skip * np.median(result[1]), skip * np.median(result[0])
    finally:
        video.release()
bob = Process(target = getcors)

how to use multi-threading for optimizing face detection?

I have a code which uses a list of image URLs from a CSV file and then performs face detection on those images after which it loads some models and does predictions on those images.
I did some load tests and found that the get_face function in the code takes a major chunk of the time required to produce the results and the extra time is taken by the pickle file created for predictions.
Question: Is there a possibility that by running these processes in threads, time can be reduced and also how this can be done in a multi threading way?
Here is the code example:
from __future__ import division
import numpy as np
from multiprocessing import Process, Queue, Pool
import os
import pickle
import pandas as pd
import dlib
from skimage import io
from skimage.transform import resize
df = pd.read_csv('/home/instaurls.csv')
detector = dlib.get_frontal_face_detector()
img_width, img_height = 139, 139
confidence = 0.8
def get_face():
    """Return the first detected face of every image in ``df``, stacked along axis 0.

    Each image is read from the value in the row's position 1, passed to the
    detector, cropped to the first detection and resized to
    ``(img_width, img_height)``.
    """
    output = None
    data1 = []
    for row in df.itertuples():
        img = io.imread(row[1])
        dets = detector(img, 1)
        for i, d in enumerate(dets):
            img = img[d.top():d.bottom(), d.left():d.right()]
            img = resize(img, (img_width, img_height))
            output = np.expand_dims(img, axis=0)
            break  # only the first detection is used
        # NOTE(review): ``output`` is not reset per image, so an image without
        # a detection appends the previous image's face (or None for the
        # first image) -- confirm this is intended.
        data1.append(output)
    data1 = np.concatenate(data1)
    return data1
get_face()
csv sample
data
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/23101834_1502115223199537_1230866541029883904_n.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/17883193_940000882769400_8455736118338387968_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/22427207_1737576603205281_7879421442167668736_n.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/12976287_1720757518213286_1180118177_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/23101834_1502115223199537_1230866541029883904_n.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/16788491_748497378632253_566270225134125056_a.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/21819738_128551217878233_9151523109507956736_n.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/14295447_318848895135407_524281974_a.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/18160229_445050155844926_2783054824017494016_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/23101834_1502115223199537_1230866541029883904_n.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/17883193_940000882769400_8455736118338387968_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/22427207_1737576603205281_7879421442167668736_n.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/12976287_1720757518213286_1180118177_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/23101834_1502115223199537_1230866541029883904_n.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/16788491_748497378632253_566270225134125056_a.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/21819738_128551217878233_9151523109507956736_n.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/14295447_318848895135407_524281974_a.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/18160229_445050155844926_2783054824017494016_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/23101834_1502115223199537_1230866541029883904_n.jpg

Here is how you could try to do it in parallel:
from __future__ import division
import numpy as np
from multiprocessing import Process, Queue, Pool
import os
import pickle
import pandas as pd
import dlib
from skimage import io
from skimage.transform import resize
from csv import DictReader
df = DictReader(open('/home/instaurls.csv')) # DictReader is iterable
detector = dlib.get_frontal_face_detector()
img_width, img_height = 139, 139
confidence = 0.8
def get_face(row):
    """
    Return the first detected face of one image, or None if no face is found.

    Here row is dictionary where keys are CSV header names
    and values are values from current CSV row.
    """
    output = None
    # DictReader rows are keyed by column name; the sample CSV has a single
    # column called "data" that holds the image URL.
    img = io.imread(row['data'])
    dets = detector(img, 1)
    for i, d in enumerate(dets):
        img = img[d.top():d.bottom(), d.left():d.right()]
        img = resize(img, (img_width, img_height))
        output = np.expand_dims(img, axis=0)
        break  # only the first detection is used
    return output
if __name__ == '__main__':
    pool = Pool()  # default to number CPU cores
    try:
        data = list(pool.imap(get_face, df))
    finally:
        # Shut the workers down even if one of the tasks raised.
        pool.close()
        pool.join()
    # get_face returns None when no face is detected; np.concatenate cannot
    # take None entries and needs at least one array.
    faces = [item for item in data if item is not None]
    if faces:
        print(np.concatenate(faces))
Pay attention to get_face and argument that it has. Also, to what it returns. This is what I meant when I said smaller chunks of work. Now get_face processes one row from CSV.
When you run this script, pool will be a reference to an instance of Pool, and get_face is then called for each row yielded by df (the DictReader).
After everything is done, data holds processing data and then you do np.concatenate on it.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

parallelizing for loop in class with joblib: pickable error - python

Related

Python multiprocessing producing unstable results

Advancing tensorflow dataset iterator in python multiprocessing Queue

Python 3.7 : multiprocessing a for loop with shared variables

Python OpenCV speedup for multiprocessing

how to use multi-threading for optimizing face detection?

Categories

Resources