Is there any way to advance the iterator in this example?
import tensorflow as tf
import numpy as np
from multiprocessing import Process, Queue

def store(batch, queue):
    while True:
        queue.put(batch)

if __name__ == '__main__':
    pqueue = Queue()
    a1 = np.arange(1000)
    m = tf.data.Dataset.from_tensor_slices(a1).repeat().batch(1)
    iter_m = m.make_one_shot_iterator()
    m_init_ops = iter_m.make_initializer(m)
    next_m = iter_m.get_next()
    with tf.Session() as sess:
        batch = sess.run(next_m)
        pp_process = Process(target=store, args=(batch, pqueue,))
        pp_process.daemon = True
        pp_process.start()
        for i in range(10):
            print(pqueue.get())
My idea is to store preprocessed data in a queue that TensorFlow can read from during training; unfortunately, I could not advance the iterator. Any suggestions would be greatly appreciated.
The current output is
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
TensorFlow multithreading
The iterator is not advancing because you only ever execute the get_next operation once: sess.run(next_m). If you were only using TensorFlow multithreading, you could obtain the desired result by simply moving that call into the store function:
def store(sess, next_m, queue):
    while True:
        queue.put(sess.run(next_m))

# batch = sess.run(next_m)  # <- Remove
pp_process = Thread(target=store, args=(sess, next_m, pqueue,))  # <- Thread, with the correct args passed
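For reference, a minimal runnable sketch of this threading variant might look like the following (assuming the TF 1.x API from the question; a plain queue.Queue is enough here, since threads share memory):

import threading
import queue

import numpy as np
import tensorflow as tf

def store(sess, next_m, q):
    # get_next is run inside the loop, so the iterator advances on every put.
    while True:
        q.put(sess.run(next_m))

if __name__ == '__main__':
    pqueue = queue.Queue(maxsize=100)  # bounded, so the producer cannot run away
    a1 = np.arange(1000)
    m = tf.data.Dataset.from_tensor_slices(a1).repeat().batch(1)
    next_m = m.make_one_shot_iterator().get_next()
    with tf.Session() as sess:
        t = threading.Thread(target=store, args=(sess, next_m, pqueue))
        t.daemon = True
        t.start()
        for _ in range(10):
            print(pqueue.get())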
TensorFlow multiprocessing
However, for multiprocessing you should also make sure that you never fork a new process after having already created a session, since the session object is not serializable.
In your case, you can simply create a new session inside the store function and start the main session only after forking:
from multiprocessing import Process, Queue

import numpy as np
import tensorflow as tf

def store(next_m, queue):
    with tf.Session() as sess:
        while True:
            queue.put(sess.run(next_m))

if __name__ == '__main__':
    ...
    pp_process = Process(target=store, args=(next_m, pqueue,))
    pp_process.daemon = True
    pp_process.start()  # <- Fork before starting this session!
    with tf.Session() as sess:
        for i in range(10):
            print(pqueue.get())
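Putting this together with the dataset setup from the question, a complete sketch could look like this (assuming the default fork start method on Linux, so the child process inherits the graph):

from multiprocessing import Process, Queue

import numpy as np
import tensorflow as tf

def store(next_m, q):
    # Each process owns its own session; the graph itself is inherited via fork.
    with tf.Session() as sess:
        while True:
            q.put(sess.run(next_m))

if __name__ == '__main__':
    pqueue = Queue(maxsize=100)
    a1 = np.arange(1000)
    m = tf.data.Dataset.from_tensor_slices(a1).repeat().batch(1)
    next_m = m.make_one_shot_iterator().get_next()
    pp_process = Process(target=store, args=(next_m, pqueue))
    pp_process.daemon = True
    pp_process.start()  # fork before creating the main session
    with tf.Session() as sess:
        for _ in range(10):
            print(pqueue.get())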
Related
Whenever I try to use shared memory with Python's multiprocessing module to fill a huge array in parallel, I use something like:
import numpy as np
from multiprocessing import Process, RawArray

def tf(x, arr):
    # Wrap the shared buffer in a NumPy array and fill one slice of it.
    arr = np.frombuffer(arr, dtype=np.float32).reshape((10, 10, 10))
    arr[x] = np.random.random((10, 10))

mpa = RawArray('f', 1000)
ncpu = 4
procs = []
for i in range(10):
    procs.append(Process(target=tf, args=(i, mpa)))
    procs[-1].start()
    if len(procs) == ncpu:
        procs[0].join()
        procs.pop(0)

for p in procs:
    p.join()

arr = np.frombuffer(mpa, dtype=np.float32).reshape((10, 10, 10))
to ensure that only as many processes are active as I have CPUs. If I try to use Pool and apply_async, the array is not altered for some reason. So I wonder whether it is possible to use Pool, or any other intended mechanism, to manage the number of active processes.
The above code works but is not the most efficient, since I only check whether the first process I added has finished in order to decide whether to add another one.
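For what it's worth, a common pattern for combining Pool with a RawArray is to hand the buffer to each worker through an initializer, since the raw buffer cannot be passed as a map argument. A minimal sketch (init_worker and fill_slice are hypothetical names):

import numpy as np
from multiprocessing import Pool, RawArray

_shared = {}

def init_worker(raw):
    # Stash the shared buffer in module state; each worker wraps it once.
    _shared['arr'] = np.frombuffer(raw, dtype=np.float32).reshape((10, 10, 10))

def fill_slice(x):
    _shared['arr'][x] = np.random.random((10, 10))

if __name__ == '__main__':
    mpa = RawArray('f', 1000)
    # The pool caps the number of concurrent workers at 4.
    with Pool(processes=4, initializer=init_worker, initargs=(mpa,)) as pool:
        pool.map(fill_slice, range(10))
    arr = np.frombuffer(mpa, dtype=np.float32).reshape((10, 10, 10))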
I am working in a Jupyter notebook. I'm new to multiprocessing in Python, and I'm trying to parallelize the calculation of a function over a grid of parameters. Here is a snippet of code quite representative of what I'm doing:
import os
import numpy as np
from concurrent.futures import ProcessPoolExecutor

def f(x, y):
    print(os.getpid(), x, y, x + y)
    return x + y

xs = np.linspace(5, 7, 3).astype(int)
ys = np.linspace(1, 3, 3).astype(int)
func = lambda p: f(*p)

with ProcessPoolExecutor() as executor:
    args = (arg for arg in zip(xs, ys))
    results = executor.map(func, args)
    for res in results:
        print(res)
The executor doesn't even start.
There is no problem whatsoever if I execute the same thing serially, e.g. with a list comprehension:
args = (arg for arg in zip(xs, ys))
results = [func(arg) for arg in args]
Are you running on Windows? I think your main problem is that each process is trying to re-execute your whole script, so you should include an if __name__ == "__main__" check. I think you have a second issue: you're trying to use a lambda function, which can't be pickled, and the processes communicate by pickling the data. There are workarounds for that, but in this case it looks like you don't really need the lambda. Try something like this:
import os
import numpy as np
from concurrent.futures import ProcessPoolExecutor

def f(x, y):
    print(os.getpid(), x, y, x + y)
    return x + y

if __name__ == '__main__':
    xs = np.linspace(5, 7, 3).astype(int)
    ys = np.linspace(1, 3, 3).astype(int)
    with ProcessPoolExecutor() as executor:
        results = executor.map(f, xs, ys)
        for res in results:
            print(res)
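If you really did need to pass argument tuples, one pickle-friendly workaround for the lambda is a module-level wrapper function. A sketch (f_star is a hypothetical name):

import os
from concurrent.futures import ProcessPoolExecutor

import numpy as np

def f(x, y):
    print(os.getpid(), x, y, x + y)
    return x + y

def f_star(p):
    # A module-level wrapper can be pickled, unlike a lambda.
    return f(*p)

if __name__ == '__main__':
    xs = np.linspace(5, 7, 3).astype(int)
    ys = np.linspace(1, 3, 3).astype(int)
    with ProcessPoolExecutor() as executor:
        for res in executor.map(f_star, zip(xs, ys)):
            print(res)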
Can anyone help me understand why this simple example of trying to speed up a for loop using Python's multiprocessing module produces unstable results? I use a Manager.list() to store the values from the child processes.
Clearly I'm doing at least one thing wrong. What would be the correct way to do this?
import numpy as np
import multiprocessing
from matplotlib import pyplot as plt
from functools import partial
from multiprocessing import Manager

def run_parallel(x_val, result):
    val = np.arctan(x_val)
    result.append(val)

def my_func(x_array, parallel=False):
    if not parallel:
        result = []
        for k in x_array:
            result.append(np.arctan(k))
        return result
    else:
        manager = Manager()
        m_result = manager.list()
        pool = multiprocessing.Pool(4)
        pool.map(partial(run_parallel, result=m_result), x_array)
        return list(m_result)

test_x = np.linspace(0.1, 1, 50)
serial = my_func(test_x, parallel=False)
parallel = my_func(test_x, parallel=True)

plt.figure()
plt.plot(test_x, serial, label='serial')
plt.plot(test_x, parallel, label='parallel')
plt.legend(loc='best')
plt.show()
The output I'm getting looks like this (plot omitted), and it looks different every time this runs.
I added some print statements, and it turned out that the order in which the elements of x_array are processed is arbitrary... That's why the plot looks so weird. One option is to keep (argument, arctan value) pairs and then sort them by the argument value.
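A minimal sketch of that pairing idea (with hypothetical variable names) could look like this:

import numpy as np
import multiprocessing
from functools import partial
from multiprocessing import Manager

def run_parallel(x_val, result):
    # Store (argument, value) pairs so the original order can be recovered.
    result.append((x_val, np.arctan(x_val)))

if __name__ == '__main__':
    manager = Manager()
    m_result = manager.list()
    with multiprocessing.Pool(4) as pool:
        pool.map(partial(run_parallel, result=m_result), np.linspace(0.1, 1, 50))
    pairs = sorted(m_result)          # sort the pairs by the argument
    test_x, parallel = zip(*pairs)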
EDIT
I read more, and it turned out that pool.map returns its values in order... This works as you wanted:
import numpy as np
import multiprocessing
from matplotlib import pyplot as plt

def run_parallel(x_val):
    return np.arctan(x_val)

def my_func(x_array, parallel=False):
    if not parallel:
        result = []
        for k in x_array:
            result.append(np.arctan(k))
        return result
    else:
        # pool.map returns results in the order of x_array,
        # so no Manager list is needed to collect them.
        pool = multiprocessing.Pool(4)
        x = pool.map(run_parallel, x_array)
        return list(x)

test_x = np.linspace(0.1, 1, 50)
parallel = my_func(test_x, parallel=True)

plt.figure()
plt.plot(test_x, parallel, label='parallel')
plt.legend(loc='best')
plt.show()
In every step of the while_loop, I want to update a 0.5 GB variable. I cannot avoid the loop because each iteration depends on the previous one. My program needs to run the while loop 100 million times.
To test the performance of tf.while_loop in this scenario, I made a test in which the update simply adds a constant to the variable.
However, even this simple loop takes 24 seconds and requires four times 1 GB of memory. I suspect the loop is constantly trying to reallocate 1 GB chunks of memory, which is horribly slow on a GPU. The GPU has 4 GB of memory; when I set the variable to 2 GB, I get an OOM error.
Is it possible to avoid the re-allocation?
I can use x as a loop variable instead of using tf.control_dependencies (see the sketch after the test code below), but that uses a bit more memory.
tf.contrib.compiler.jit.experimental_jit_scope leads to OOM.
Thanks.
Test:
import tensorflow as tf
import numpy as np
from functools import partial
from timeit import default_timer as timer

def body1(x, i):
    a = tf.assign(x, x + 0.001)
    with tf.control_dependencies([a]):
        return i + 1

def make_loop1(x, end_ix):
    i = tf.Variable(0, name="i", dtype=np.int32)
    cond = lambda i2: tf.less(i2, end_ix)
    body = partial(body1, x)
    return tf.while_loop(
        cond, body, [i], back_prop=False,
        parallel_iterations=1)

def main():
    N = int(1e9 / 4)  # 1 GB of float32
    x = tf.get_variable('x', shape=N, dtype=np.float32,
                        initializer=tf.ones_initializer)
    end_ix = tf.constant(int(1000), dtype=np.int32)
    loop1 = make_loop1(x, end_ix)
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        print("running_loop1")
        st = timer()
        sess.run(loop1)
        en = timer()
        print(en - st)  # elapsed seconds (the original printed st - en, which is negative)
        print(sess.run(x[0]))

main()
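For comparison, a sketch of the variant mentioned above that threads x through as a loop variable instead of using tf.control_dependencies (body2 and make_loop2 are hypothetical names):

def body2(x, i):
    # The updated tensor is threaded through as a loop variable,
    # so no tf.assign or control dependency is needed.
    return x + 0.001, i + 1

def make_loop2(x0, end_ix):
    cond = lambda x, i: tf.less(i, end_ix)
    return tf.while_loop(
        cond, body2, [x0, tf.constant(0, dtype=np.int32)],
        back_prop=False, parallel_iterations=1)

# Usage: loop2 = make_loop2(tf.ones(N, dtype=np.float32), end_ix)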
'''Import libraries for simulation'''
import tensorflow as tf
import numpy as np

'''Imports for visualization'''
import PIL.Image
from io import BytesIO
from IPython.display import Image, display

'''Now we'll define a function to actually display the image once we have
iteration counts'''
def DisplayFractal(a, fmt='jpeg'):
    a_cyclic = (6.28 * a / 200.0).reshape(list(a.shape) + [1])  # this line was missing in the original snippet
    img = np.concatenate([10 + 20 * np.cos(a_cyclic),
                          30 + 50 * np.sin(a_cyclic),
                          155 - 80 * np.cos(a_cyclic)], 2)
    img[a == a.max()] = 0
    a = img
    a = np.uint8(np.clip(a, 0, 255))
    f = BytesIO()
    PIL.Image.fromarray(a).save(f, fmt)
    display(Image(data=f.getvalue()))
sess = tf.InteractiveSession()

# Use NumPy to create a 2D array of complex numbers
Y, X = np.mgrid[-1.3:1.3:0.005, -2:1:0.005]
Z = X + 1j * Y
print(Z)

# Now we define and initialize TensorFlow tensors.
xs = tf.constant(Z.astype(np.complex64))
zs = tf.Variable(xs)
ns = tf.Variable(tf.zeros_like(xs, tf.float32))

tf.global_variables_initializer().run()

zs_ = zs * zs + xs
print(zs)

# Have we diverged with this new value?
not_diverged = tf.abs(zs_) < 4

'''
Operation to update the zs and the iteration count.
Note: We keep computing zs after they diverge! This
is very wasteful! There are better, if a little
less simple, ways to do this.
'''
step = tf.group(zs.assign(zs_), ns.assign_add(tf.cast(not_diverged, tf.float32)))

for i in range(200):
    step.run()

DisplayFractal(ns.eval())
I had the same problem. You have to run the TensorFlow example in a Jupyter notebook:
http://jupyter.org/
If you run it from another IDE, such as Spyder, all you will see is <IPython.core.display.Image object> printed in the console.
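If you are stuck outside a notebook, one possible workaround (a sketch, not part of the original example) is to write the rendered image to a file instead of going through IPython.display:

import numpy as np
import PIL.Image

def save_fractal(a, path='fractal.jpeg'):
    # Same color mapping as DisplayFractal above, but written to a file
    # instead of being displayed through IPython.
    a_cyclic = (6.28 * a / 200.0).reshape(list(a.shape) + [1])
    img = np.concatenate([10 + 20 * np.cos(a_cyclic),
                          30 + 50 * np.sin(a_cyclic),
                          155 - 80 * np.cos(a_cyclic)], 2)
    img[a == a.max()] = 0
    img = np.uint8(np.clip(img, 0, 255))
    PIL.Image.fromarray(img).save(path)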
Hmm, I managed to fix this problem; you can take a look at my function:
def displayFractal(a, fmt='jpeg'):
    a_cyclic = (6.28 * a / 200.0).reshape(list(a.shape) + [1])
    # Note: I changed the color constants; you can keep your original numbers.
    img = np.concatenate([5 + 10 * np.cos(a_cyclic),
                          15 + 25 * np.sin(a_cyclic),
                          70 - 40 * np.cos(a_cyclic)], 2)
    img[a == a.max()] = 0
    a = img
    a = np.uint8(np.clip(a, 0, 255))
    plt.imshow(PIL.Image.fromarray(a))
    plt.show()
Of course, you should import matplotlib.pyplot as plt first.
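For completeness, a minimal set of imports this variant relies on (in addition to the rest of the example above) would be:

import numpy as np
import PIL.Image
from matplotlib import pyplot as plt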