Avoid memory re-allocation in tensorflow while_loop - python

In every step of the while_loop, I want to update a 0.5 GB variable. I cannot avoid the loop because each iteration depends on the previous one. My program needs to run the while loop 100 million times.
To test the performance of tf.while_loop in this scenario, I made a test. The update here is simply adding a constant to the variable.
However, even this simple loop takes 24 seconds and requires 4 × 1 GB of memory. I suspect the loop is constantly trying to reallocate 1 GB chunks of memory, which is horribly slow on a GPU. The GPU has 4 GB of memory; when I set the variable to 2 GB, I get an OOM error.
Is it possible to avoid the re-allocation?
I can use x as a loop variable instead of using tf.control_dependencies, but that uses a bit more memory (see the sketch after the test code below).
tf.contrib.compiler.jit.experimental_jit_scope leads to OOM.
Thanks.
Test:
import tensorflow as tf
import numpy as np
from functools import partial
from timeit import default_timer as timer

def body1(x, i):
    a = tf.assign(x, x + 0.001)
    with tf.control_dependencies([a]):
        return i + 1

def make_loop1(x, end_ix):
    i = tf.Variable(0, name="i", dtype=np.int32)
    cond = lambda i2: tf.less(i2, end_ix)
    body = partial(body1, x)
    return tf.while_loop(
        cond, body, [i], back_prop=False,
        parallel_iterations=1)

def main():
    N = int(1e9 / 4)
    x = tf.get_variable('x', shape=N, dtype=np.float32,
                        initializer=tf.ones_initializer)
    end_ix = tf.constant(int(1000), dtype=np.int32)
    loop1 = make_loop1(x, end_ix)
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        print("running_loop1")
        st = timer()
        sess.run(loop1)
        en = timer()
        print(en - st)
        print(sess.run(x[0]))

main()
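For completeness, this is roughly what I mean by using x as a loop variable instead of tf.control_dependencies (a sketch only; body2/make_loop2 are names I picked for illustration):

# Sketch: carry x itself through the loop instead of assigning to a tf.Variable.
def body2(x, i):
    return x + 0.001, i + 1

def make_loop2(x0, end_ix):
    i0 = tf.constant(0, dtype=np.int32)
    cond = lambda x, i: tf.less(i, end_ix)
    x_final, _ = tf.while_loop(
        cond, body2, [x0, i0], back_prop=False,
        parallel_iterations=1)
    return x_final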

Related

tf.data.datasets set each batch (prefetch)

I am looking for help thinking through this.
I have a function (that is not a generator) that will give me any number of samples.
Let's say that all the data I want to train on (1000 samples) can't fit into memory.
So I want to call this function 10 times to get a smaller number of samples that does fit into memory.
This is a dummy example for simplicity.
import numpy as np

def get_samples(num_samples: int, random_seed=0):
    np.random.seed(random_seed)
    x = np.random.randint(0, 100, num_samples)
    y = np.random.randint(0, 2, num_samples)
    return np.array(list(zip(x, y)))
Again, let's say get_samples(1000, 0) won't fit into memory.
So in theory I am looking for something like this:
batch_size = 100
total_num_samples = 1000
batches = []
for i in range(total_num_samples // batch_size):
    batches.append(get_samples(batch_size, i))
But this still loads everything into memory.
Again this function is a dummy representation and the real one is already defined and not a generator.
In tf land, I was hoping that:
tf.data.Dataset.batch[0] would equal the output of get_data(100, 0)
tf.data.Dataset.batch[1] would equal the output of get_data(100, 1)
tf.data.Dataset.batch[2] would equal the output of get_data(100, 2)
...
tf.data.Dataset.batch[9] would equal the output of get_data(100, 9)
I understand that I can use tf.data.Datasets with a generator (and I think you can set a generator per batch). But the function I have gives more than a single sample, and the setup is too expensive to repeat for every single sample.
I was hoping to use tf.data.Dataset.prefetch() to run the batch-getting function on every batch, and of course it would be called with the same parameters on every epoch.
Sorry if the explanation is convoluted. Trying my best to describe the problem.
Anyone have any ideas?
This is what I came up with:
import numpy as np
import tensorflow as tf

def simple_static_synthesizer(batch_size, seed=1, verbose=True):
    if verbose:
        print(f"Creating Synthetic Data with seed {seed}")
    rng = np.random.default_rng(seed)
    all_x = []
    all_y = []
    for i in range(batch_size):
        x = np.array(np.concatenate((rng.integers(0, 100, 1, dtype=int),
                                     rng.integers(0, 100, 1, dtype=int),
                                     rng.integers(0, 100, 1, dtype=int))))
        y = np.array(rng.integers(0, 2, 1, dtype=int))
        all_x.append(x)
        all_y.append(y)
    return all_x, all_y

def my_generator(total_size, batch_size, seed=0, verbose=True):
    counter = 0
    for i in range(total_size):
        # Regenerate the data for every batch
        if counter % batch_size == 0:
            x, y = simple_static_synthesizer(batch_size, seed, verbose)
            seed += 1
        yield x[i % batch_size], y[i % batch_size]
        counter += 1

my_gen = my_generator(10, 2, seed=1)
# See values
for x, y in my_gen:
    print(x, y)

# Call again, this gives the same answer as above
my_gen = my_generator(10, 2, seed=1)
for x, y in my_gen:
    print(x, y)

# Dataset with small batches to see if it is doing it correctly
total_samples = 10
batch_size = 2
seed = 5
dataset = tf.data.Dataset.from_generator(
    my_generator,
    args=[total_samples, batch_size, seed],
    output_signature=(
        tf.TensorSpec(shape=(3,), dtype=tf.uint8),
        tf.TensorSpec(shape=(1,), dtype=tf.uint8),
    )
)
for i, (x, y) in enumerate(dataset):
    print(x.numpy(), y.numpy())
    if i == 4:
        break  # shows the first 3 synthesizer calls
Wish we could have notebook answers!
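As a side note, one variant (my own sketch, not tested against the real expensive function) is to have the generator yield whole batches, so the expensive call runs exactly once per batch and prefetch() can overlap it with training. It assumes get_samples is the dummy from the question, returning an array of shape (batch_size, 2); the dtype in the signature may need adjusting:

import numpy as np
import tensorflow as tf

def batch_generator(num_batches, batch_size):
    for i in range(num_batches):
        # One expensive call per batch.
        batch = get_samples(batch_size, random_seed=i)  # shape (batch_size, 2)
        yield batch[:, 0], batch[:, 1]

dataset = tf.data.Dataset.from_generator(
    lambda: batch_generator(10, 100),
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int64),  # x for one whole batch
        tf.TensorSpec(shape=(None,), dtype=tf.int64),  # y for one whole batch
    ),
).prefetch(tf.data.AUTOTUNE)

for x_batch, y_batch in dataset:
    print(x_batch.shape, y_batch.shape)  # (100,), (100,)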

How to free up RAM when using Jupyter Notebook?

I have a Jupyter Notebook where I am working with large matrices (20000x20000). I am running multiple iterations, but after every iteration I get an error saying that I do not have enough RAM. If I restart the kernel, I can run the next iteration, so perhaps the notebook is running out of RAM because it keeps variables from earlier iterations (which aren't needed for the next one). Is there a way to free up RAM?
Edit: I don't know if the bold segment is correct. In any case, I am looking to free up RAM; any suggestions are welcome.
## Outputs:
two_moons_n_of_samples = [int(_) for _ in np.repeat(20000, 10)]
for i in range(len(two_moons_n_of_samples)):
    # print(f'n: {two_moons_n_of_samples[i]}')
    ## Generate the data and the graph
    X, ground_truth, fid = synthetic_data({'type': 'two_moons', 'n': two_moons_n_of_samples[i], 'fidelity': 60, 'sigma': 0.18})
    N = X.shape[0]
    dist_mat = sqdist(X.T, X.T)
    opt = {
        'graph': 'full',
        'tau': 0.004,
        'type': 's'
    }
    LS = dense_laplacian(dist_mat, opt)

    ## Eigenvalues and eigenvectors
    tic = time.time()  ## Time how long to calculate eigenvalues/eigenvectors
    V, E = np.linalg.eigh(LS)
    idx = np.argsort(V)
    V, E = V[idx], E[:, idx]
    V = V / V.max()
    decomposition_time = time.time() - tic

    ## Initialize u0
    u0 = np.zeros(N)
    for j in range(len(fid[0])):
        u0[fid[0][j]] = 1
    for j in range(len(fid[1])):
        u0[fid[1][j]] = -1

    ## Initialize parameters
    dt = 0.05
    gamma = 0.07
    max_iter = 100

    ## Run MAP estimation
    tic = time.time()
    u_eg, _ = probit_optimization_eig(E, V, u0, dt, gamma, fid, max_iter)
    eg_time = time.time() - tic

    ## Run MAP estimation with CG
    tic2 = time.time()
    u_cg, _ = probit_optimization_cg(LS, u0, dt, gamma, fid, max_iter)
    cg_time = time.time() - tic2

    ## Write to file:
    with open('results2_two_moons_egvscg.txt', 'a') as f:
        f.write(f'{i},{two_moons_n_of_samples[i]},{decomposition_time + eg_time},{cg_time}\n')
Error:
MemoryError: Unable to allocate 1.07 GiB for an array with shape (12000, 12000) and data type float64
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
~\AppData\Local\Temp\2/ipykernel_2344/941022539.py in <module>
11 'type': 's'
12 }
---> 13 LS = dense_laplacian(dist_mat, opt)
14
15 ## Eigenvalues and eigenvectors
C:/Users/\util\graph\dense_laplacian.py in dense_laplacian(dist_mat, opt)
69 D_inv_sqrt = 1.0 / np.sqrt(D)
70 D_inv_sqrt = np.diag(D_inv_sqrt)
---> 71 L = np.eye(W.shape[0]) - D_inv_sqrt @ W @ D_inv_sqrt
72 # L = 0.5 * (L + L.T)
73 if opt['type'] == 'rw':
MemoryError: Unable to allocate 1.07 GiB for an array with shape (12000, 12000) and data type float64
I faced the same problem; the way I solved it was:
Writing functions wherever preprocessing is required, and returning only the preprocessed variables.
Deleting huge variables once they have been used, with del x.
Clearing garbage:
import gc
gc.collect()
Sometimes clearing garbage doesn't help, and I used to clear the cache as well using
import ctypes
libc = ctypes.CDLL("libc.so.6") # clearing cache
libc.malloc_trim(0)
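Applied to the loop in the question, that could look roughly like this (a sketch only; which arrays are safe to drop depends on what later iterations actually need):

import gc

for i in range(len(two_moons_n_of_samples)):
    # ... generate data, build LS, eigendecomposition, run both solvers,
    #     write the timings to the results file as in the question ...

    # Drop the large per-iteration arrays before the next 20000x20000 round.
    del X, dist_mat, LS, V, E, u_eg, u_cg
    gc.collect()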
I tried to batch my code as far as possible.
I think the best solution for you would be to batch the matrix multiplication. Libraries like TensorFlow and PyTorch do this by default; I am not sure about NumPy though. Check https://www.tensorflow.org/api_docs/python/tf/linalg/matmul (an API for matrix multiplication in batches). Most modern-day GPU calculations are possible thanks to batching!
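For what it's worth, a minimal illustration of the batched matmul API linked above (my own sketch; note that batching over a leading dimension is not by itself a block decomposition of a single 20000x20000 product):

import tensorflow as tf

# tf.linalg.matmul treats leading dimensions as batch dimensions,
# so many smaller matrix products can be issued in one call.
a = tf.random.normal([8, 2500, 2500])
b = tf.random.normal([8, 2500, 2500])
c = tf.linalg.matmul(a, b)  # shape [8, 2500, 2500]
print(c.shape)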
I would suggest adding more swap space, which is really easy and will probably save you more time and headache than redesigning the code to be less wasteful or trying to delete and garbage-collect unnecessary objects. It would of course be slower than using RAM, since it uses the disk to simulate the extra memory needed.
Excellent answer on how to do this on Ubuntu, link

How to speed up Cupy with Streams correctly?

I want to use cuda streams in order to speed up small calculations on the GPU. My test so far consists of the following:
import cupy as xp
import time

x = xp.random.randn(10003, 20000) + 1j * xp.random.randn(10003, 20000)
y = xp.zeros_like(x)

nStreams = 16
streams = [xp.cuda.stream.Stream() for ii in range(nStreams)]

f = xp.fft.fft(x[:, :200])

t = time.time()
for ii in range(int(x.shape[1]/100)):
    ss = streams[ii % nStreams]
    with ss:
        y[:, ii*200:(ii+1)*200] = xp.fft.fft(x[:, ii*200:(ii+1)*200], axis=0)
for ii, ss in enumerate(streams):
    ss.synchronize()
print(time.time() - t)

t = time.time()
for ii in range(int(x.shape[1]/100)):
    y[:, ii*200:(ii+1)*200] = xp.fft.fft(x[:, ii*200:(ii+1)*200], axis=0)
xp.cuda.Stream.null.synchronize()
print(time.time() - t)
produces
[user@pc snippets]$ intelpython3 strm.py
0.019365549087524414
0.018717050552368164
which makes me doubt that I am doing everything correctly. Additionally, the situation becomes even more severe when replacing the FFT calls with calls to xp.sum, which yields
[user@pc snippets]$ intelpython3 strm.py
0.002195596694946289
0.001004934310913086
What is the rationale behind cupy streams? How do I use them to my advantage?

Sampling from tensorflow Dataset into same tensor multiple times per single session.run() call

Consider the following example:
import tensorflow as tf
import numpy as np

X = np.arange(4).reshape(4, 1) + (np.arange(3) / 10).reshape(1, 3)

batch = tf.data.Dataset.from_tensor_slices(X) \
    .batch(2).make_one_shot_iterator().get_next()

def foo(x):
    return x + 1

tensor = foo(batch)
Now, I'm looking for a way to be able to sample tensor multiple times per single session.run() call, i.e.:
def bar(x):
    return x - 1

result1 = bar(tensor)
with tf.control_dependencies([result1]):
    op = <create operation to sample from dataset into `tensor` again>
with tf.control_dependencies([op]):
    result2 = bar(tensor)

sess = tf.Session()
print(*sess.run([result1, result2]), sep='\n\n')
which should output:
[[0. 0.1 0.2]
[1. 1.1 1.2]]
[[2. 2.1 2.2]
[3. 3.1 3.2]]
Is that even possible? I know one can call get_next() multiple times to get multiple dataset samples in different tensor objects, but can one sample into the same tensor object?
For me the use case is such that the foo and bar parts of this code are separated, and the foo part doesn't know how many times the samples will be needed per run.
P.S.
I'm using tf 1.12. 1.13 is an option too, but not tf 2.
Yes, it's possible.
A couple of insights on what you've tried so far:
You can use the dataset iterator returned from make_one_shot_iterator() each time you need a new value from the dataset
You can make your own function that is part of the tf graph to pass the result through foo()
Something like this gives the output you want (as I understand it):
import tensorflow as tf
import numpy as np

X = np.arange(4).reshape(4, 1) + (np.arange(3) / 10).reshape(1, 3)

iterator = tf.data.Dataset.from_tensor_slices(X) \
    .batch(2).make_one_shot_iterator()

def foo(x):
    return x + 1

def get_tensor():
    return foo(iterator.get_next())

tensor = get_tensor()

def bar(x):
    return x - 1

result1 = bar(tensor)
with tf.control_dependencies([result1]):
    op = get_tensor()
with tf.control_dependencies([op]):
    result2 = bar(op)

sess = tf.Session()
print(*sess.run([result1, result2]), sep='\n\n')

tensorflow: memory allocation for a 'for' cycle

I am trying to use TensorFlow to calculate the minimum Euclidean distance between each column of a matrix and all the other columns (excluding the column itself):
with graph.as_default():
    ...
    def get_diversity(matrix):
        num_rows = matrix.get_shape()[0].value
        num_cols = matrix.get_shape()[1].value
        identity = tf.ones([1, num_cols], dtype=tf.float32)
        diversity = 0
        for i in range(num_cols):
            col = tf.reshape(matrix[:, i], [num_rows, 1])
            col_extended_to_matrix = tf.matmul(col, identity)
            difference_matrix = (col_extended_to_matrix - matrix) ** 2
            sum_vector = tf.reduce_sum(difference_matrix, 0)
            mask = tf.greater(sum_vector, 0)
            non_zero_vector = tf.select(mask, sum_vector, tf.ones([num_cols], dtype=tf.float32) * 9e99)
            min_diversity = tf.reduce_min(non_zero_vector)
            diversity += min_diversity
        return diversity / num_cols
    ...
    diversity = get_diversity(matrix1)
    ...
When I call get_diversity() once per 1000 iterations (on the scale of 300k) it works just fine. But when I try to call it at every iteration the interpreter returns:
W tensorflow/core/common_runtime/bfc_allocator.cc:271] Ran out of memory trying to allocate 2.99MiB. See logs for memory state.
I was thinking that was because TF creates a new set of variables each time get_diversity() is called. I tried this:
def get_diversity(matrix, scope):
    scope.reuse_variables()
    ...

with tf.variable_scope("diversity") as scope:
    diversity = get_diversity(matrix1, scope)
But it did not fix the problem.
How can I fix this allocation issue and use get_diversity() with a large number of iterations?
Assuming you call get_diversity() multiple times in your training loop, Aaron's comment is a good one: instead you can do something like the following:
diversity_input = tf.placeholder(tf.float32, [None, None], name="diversity_input")
diversity = get_diversity(diversity_input)
# ...
with tf.Session() as sess:
    for _ in range(NUM_ITERATIONS):
        # ...
        diversity_val = sess.run(diversity, feed_dict={diversity_input: ...})
This will avoid creating new operations each time round the loop, which should prevent the memory leak. This answer has more details.
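As an aside, the per-column Python loop can also be replaced by a fixed, vectorized piece of graph that builds the same quantity via the Gram matrix (my own sketch, using the same old-style TF ops as the question; it computes the mean of the minimum squared distances, just like the loop above):

def get_diversity_vectorized(matrix):
    num_cols = matrix.get_shape()[1].value
    # Pairwise squared distances between columns via the Gram matrix:
    # d(i, j) = ||c_i||^2 + ||c_j||^2 - 2 * <c_i, c_j>
    gram = tf.matmul(matrix, matrix, transpose_a=True)      # [num_cols, num_cols]
    sq_norms = tf.diag_part(gram)                           # [num_cols]
    dists = (tf.expand_dims(sq_norms, 1)
             + tf.expand_dims(sq_norms, 0) - 2.0 * gram)
    # Push the zero diagonal (each column vs. itself) out of the way before the min.
    dists += tf.diag(tf.ones([num_cols], dtype=tf.float32) * 9e99)
    min_dists = tf.reduce_min(dists, 1)                     # min over the other columns
    return tf.reduce_mean(min_dists)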
