Memory leak in facenet library written in TensorFlow - python

I wanted to use this example https://github.com/davidsandberg/facenet/blob/master/contributed/real_time_face_recognition.py, but for running face recognition on a set of images. My main function looks as follows:
import os
import time

import cv2

import face

def main(args):
    face_recognition = face.Recognition()
    if args.debug:
        face.debug = True
    path = './images/'
    folders = os.listdir(path)
    for f in folders:
        images = os.listdir(path + f)
        for i in images:
            img = cv2.imread(path + f + '/' + i)
            start = time.time()
            faces = face_recognition.identify(img)
            for face_n in faces:
                print(i, '-', face_n.name, '-', time.time() - start)
    del images, path, img, faces, folders
When I checked how RAM was being used, I found a memory leak:
> 183.5 MiB 183.5 MiB @profile(stream=fp)
> def main(args):
> 383.7 MiB 200.2 MiB face_recognition = face.Recognition()
> 383.7 MiB 0.0 MiB if args.debug:
> face.debug = True
>
> 383.7 MiB 0.0 MiB path = './images/'
> 383.7 MiB 0.0 MiB folders = os.listdir(path)
> 1122.2 MiB -9.7 MiB for f in folders[:5]:
> 1113.9 MiB 0.0 MiB images = os.listdir(path + f)
> 1124.9 MiB -55.1 MiB for i in images:
> 1124.9 MiB -41.9 MiB img = cv2.imread(path + f + '/' + i)
> 1124.9 MiB -52.4 MiB start = time.time()
> 1124.9 MiB 679.9 MiB faces = face_recognition.identify(img)
> 1124.9 MiB -112.1 MiB for face_n in faces:
> 1124.9 MiB -52.7 MiB print(i, '-', face_n.name, '-', time.time() - start)
> 1122.2 MiB 0.0 MiB del images, path, img, faces, folders
It looks like the main leak is in https://github.com/davidsandberg/facenet/blob/master/contributed/face.py:
> 141 389.0 MiB 389.0 MiB @profile(stream=fp)
> 142 def find_faces(self, image):
> 143 389.0 MiB 0.0 MiB faces = []
> 144
> 145 389.0 MiB 0.0 MiB bounding_boxes, _ = detect_face.detect_face(image, self.minsize,
> 146 389.0 MiB 0.0 MiB self.pnet, self.rnet, self.onet,
> 147 486.7 MiB 97.7 MiB self.threshold, self.factor)
> 148 486.7 MiB 0.0 MiB for bb in bounding_boxes:
How can I solve this issue when I need to process the images in loops?
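One way to narrow this down (a diagnostic sketch of my own, not a known fix for facenet) is to call identify() repeatedly on a single already-decoded image and track the process RSS with psutil; if the RSS still climbs, the growth is happening inside the detection/embedding call itself rather than in the image-reading loop:

```python
import os

import cv2
import psutil

import face  # facenet's contributed/face.py


def rss_mb():
    # Resident set size of the current process in MiB
    return psutil.Process(os.getpid()).memory_info().rss / 2**20


def leak_check(image_path, iterations=50):
    recognition = face.Recognition()
    img = cv2.imread(image_path)
    for n in range(iterations):
        recognition.identify(img)
        if n % 10 == 0:
            print(n, '{:.1f} MiB'.format(rss_mb()))

# leak_check('./images/person/sample.jpg')  # illustrative path
```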

Related

Why does my Python loop tend to consume all the memory?

I want to generate and keep a set of tuples for a certain amount of time. Yet I found that the program seemed to consume all the memory if given enough time.
I have tried two methods. One is deleting the newly generated variables, the other is calling gc.collect(). But neither of them worked. If I just generate the tuples and do not keep them, the program consumes a limited amount of memory.
generate and keep: gk.py
import gc
import time
from memory_profiler import profile
from random import sample
from sys import getsizeof

@profile
def loop(limit):
    t = time.time()
    i = 0
    A = set()
    while True:
        i += 1
        duration = time.time() - t
        a = tuple(sorted(sample(range(200), 100)))
        A.add(a)
        if not i % int(1e4):
            print('step {:.2e}...'.format(i))
        if duration > limit:
            print('done')
            break
        # method 1: delete the variables
        # del duration, a
        # method 2: use gc
        # gc.collect()
    memory = getsizeof(t) + getsizeof(i) + getsizeof(duration) + \
        getsizeof(a) + getsizeof(limit) + getsizeof(A)
    print('memory consumed: {:.2e}MB'.format(memory/2**20))
    pass

def main():
    limit = 300
    loop(limit)
    pass

if __name__ == '__main__':
    print('running...')
    main()
generate and not keep: gnk.py
import time
from memory_profiler import profile
from random import sample
from sys import getsizeof

@profile
def loop(limit):
    t = time.time()
    i = 0
    while True:
        i += 1
        duration = time.time() - t
        a = tuple(sorted(sample(range(200), 100)))
        if not i % int(1e4):
            print('step {:.2e}...'.format(i))
        if duration > limit:
            print('done')
            break
    memory = getsizeof(t) + getsizeof(i) + getsizeof(duration) + \
        getsizeof(a) + getsizeof(limit)
    print('memory consumed: {:.2e}MB'.format(memory/2**20))
    pass

def main():
    limit = 300
    loop(limit)
    pass

if __name__ == '__main__':
    print('running...')
    main()
use "mprof" (needs module memory_profiler) in cmd/shell to check memory usage
mprof run my_file.py
mprof plot
result of gk.py
memory consumed: 4.00e+00MB
Filename: gk.py
Line # Mem usage Increment Line Contents
================================================
12 32.9 MiB 32.9 MiB @profile
13 def loop(limit):
14 32.9 MiB 0.0 MiB t = time.time()
15 32.9 MiB 0.0 MiB i = 0
16 32.9 MiB 0.0 MiB A = set()
17 32.9 MiB 0.0 MiB while True:
18 115.8 MiB 0.0 MiB i += 1
19 115.8 MiB 0.0 MiB duration = time.time() - t
20 115.8 MiB 0.3 MiB a = tuple(sorted(sample(range(200), 100)))
21 115.8 MiB 2.0 MiB A.add(a)
22 115.8 MiB 0.0 MiB if not i % int(1e4):
23 111.8 MiB 0.0 MiB print('step {:.2e}...'.format(i))
24 115.8 MiB 0.0 MiB if duration > limit:
25 115.8 MiB 0.0 MiB print('done')
26 115.8 MiB 0.0 MiB break
27 # method 1: delete the variables
28 # del duration, a
29 # method 2: use gc
30 # gc.collect()
31 memory = getsizeof(t) + getsizeof(i) + getsizeof(duration) + \
32 115.8 MiB 0.0 MiB getsizeof(a) + getsizeof(limit) + getsizeof(A)
33 115.8 MiB 0.0 MiB print('memory consumed: {:.2e}MB'.format(memory/2**20))
34 115.8 MiB 0.0 MiB pass
result of gnk.py
memory consumed: 9.08e-04MB
Filename: gnk.py
Line # Mem usage Increment Line Contents
================================================
11 33.0 MiB 33.0 MiB @profile
12 def loop(limit):
13 33.0 MiB 0.0 MiB t = time.time()
14 33.0 MiB 0.0 MiB i = 0
15 33.0 MiB 0.0 MiB while True:
16 33.0 MiB 0.0 MiB i += 1
17 33.0 MiB 0.0 MiB duration = time.time() - t
18 33.0 MiB 0.1 MiB a = tuple(sorted(sample(range(200), 100)))
19 33.0 MiB 0.0 MiB if not i % int(1e4):
20 33.0 MiB 0.0 MiB print('step {:.2e}...'.format(i))
21 33.0 MiB 0.0 MiB if duration > limit:
22 33.0 MiB 0.0 MiB print('done')
23 33.0 MiB 0.0 MiB break
24 memory = getsizeof(t) + getsizeof(i) + getsizeof(duration) + \
25 33.0 MiB 0.0 MiB getsizeof(a) + getsizeof(limit)
26 33.0 MiB 0.0 MiB print('memory consumed: {:.2e}MB'.format(memory/2**20))
27 33.0 MiB 0.0 MiB pass
I have two problems:
1. Both programs consume more memory than their variables occupy. "gk.py" consumed 115.8 MB while its variables occupied 4.00 MB; "gnk.py" consumed 33.0 MB while its variables occupied 9.08e-04 MB. Why do the programs consume more memory than the corresponding variables occupy?
2. The memory consumed by "gk.py" increases linearly with time, while the memory consumed by "gnk.py" remains constant over time. Why does this happen?
Any help would be appreciated.
Given that the size of the set is constantly increasing, it will eventually consume all the memory.
An estimate (from my computer):
10 seconds of code running ~ 5e4 tuples saved to the set
300 seconds of code running ~ 1.5e6 tuples saved to the set
1 tuple = 100 integers ~ 400bytes
total:
1.5e6 * 400bytes = 6e8bytes = 600MB filled in 300s
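Also, sys.getsizeof is shallow: getsizeof(A) only counts the set object itself (its internal table of pointers), not the tuples it references, which is why the reported 4 MB is far below what the process actually holds. A rough sketch of a deeper measurement (illustrative, not from the original answer):

```python
from random import sample
from sys import getsizeof

def deep_set_size(s):
    # Set object itself, plus every tuple it references, plus the ints inside.
    # Note: this over-counts small ints, which CPython caches and shares.
    total = getsizeof(s)
    for tup in s:
        total += getsizeof(tup) + sum(getsizeof(x) for x in tup)
    return total

A = {tuple(sorted(sample(range(200), 100))) for _ in range(10000)}
print('shallow: {:.2f} MB'.format(getsizeof(A) / 2**20))
print('deep:    {:.2f} MB'.format(deep_set_size(A) / 2**20))
```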

Using line profiler with multiprocessing

How can you profile a Python module that uses multiprocessing (multiprocessing.Pool.map) so that each spawned process is also profiled line by line?
Currently I use line_profiler for profiling but it doesn't support multiprocessing.
Is there a way to do it manually? Or maybe use some other tool?
The normal way of using line_profiler, adding @profile to the function being profiled and running kernprof -v -l script.py, leads to the following error with multiprocessing:
Can't pickle <class '__main__.Worker'>: attribute lookup Worker on __main__ failed.
To fix this, we have to set up the line_profiler ourselves in the sub-process we want to profile, rather than doing it globally via kernprof:
import multiprocessing as mp
import line_profiler

class Worker(mp.Process):
    def run(self):
        prof = line_profiler.LineProfiler()
        # Wrap all functions that you want to be profiled in this process
        # These can be global functions or any class methods
        # Make sure to replace instance methods on a class level, not the bound method self.run2
        Worker.run2 = prof(Worker.run2)
        ...
        # run the main
        self.run2()
        # store stats in separate file for each process
        prof.dump_stats('worker.lprof')

    def run2(self):
        # real run method renamed
        ...
Now running the script generates a profile file per process that we can then visualize with:
python -m line_profiler worker.lprof
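For reference, here is a self-contained sketch of this pattern; the function name, worker count, and output file names are illustrative, not from the original answer:

```python
import multiprocessing as mp
import line_profiler


def busy_work(n):
    # Function whose lines we want timed inside the worker process
    total = 0
    for i in range(n):
        total += i * i
    return total


class Worker(mp.Process):
    def run(self):
        prof = line_profiler.LineProfiler()
        # Wrap the function inside this process only
        profiled_work = prof(busy_work)
        profiled_work(10**6)
        # One stats file per process, keyed by pid to avoid collisions
        prof.dump_stats('worker_{}.lprof'.format(self.pid))


if __name__ == '__main__':
    workers = [Worker() for _ in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    # Inspect each file with: python -m line_profiler worker_<pid>.lprof
```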
You could use memory_profiler like this:
from memory_profiler import profile
import multiprocessing as mp
import time, psutil, gc, os

@profile(precision=4)
def array_ops(num):
    gc.collect()
    size1 = 10 ** num
    size2 = 20 ** (num+1)
    x = [1] * size1
    y = [2] * size2
    y *= 2
    del y
    gc.collect()
    z = x * 2
    gc.collect()
    return x

if __name__ == '__main__':
    num_workers = 3
    pool = mp.Pool(num_workers)
    pool.map(array_ops, [4, 5, 6])
    pool.close()
    pool.join()
This is a sample output
Line # Mem usage Increment Line Contents
================================================
6 34.4258 MiB 34.4258 MiB @profile(precision=4)
7 def array_ops(num):
8 34.4258 MiB 0.0000 MiB gc.collect()
9 34.4258 MiB 0.0000 MiB size1 = 10 ** num
10 34.4258 MiB 0.0000 MiB size2 = 20 ** (num+1)
11 34.5586 MiB 0.1328 MiB x = [1] * size1
12 58.7852 MiB 24.2266 MiB y = [2] * size2
13 83.2539 MiB 24.4688 MiB y *= 2
14 34.6055 MiB 0.0000 MiB del y
15 34.6055 MiB 0.0000 MiB gc.collect()
16 34.6055 MiB 0.0000 MiB z = x * 2
17 34.6055 MiB 0.0000 MiB gc.collect()
18 34.6055 MiB 0.0000 MiB return x
Filename: array_ops.py
Line # Mem usage Increment Line Contents
================================================
6 34.4258 MiB 34.4258 MiB @profile(precision=4)
7 def array_ops(num):
8 34.4258 MiB 0.0000 MiB gc.collect()
9 34.4258 MiB 0.0000 MiB size1 = 10 ** num
10 34.4258 MiB 0.0000 MiB size2 = 20 ** (num+1)
11 35.0820 MiB 0.6562 MiB x = [1] * size1
12 523.3711 MiB 488.2891 MiB y = [2] * size2
13 1011.6172 MiB 488.2461 MiB y *= 2
14 35.2969 MiB 0.0000 MiB del y
15 35.2969 MiB 0.0000 MiB gc.collect()
16 36.5703 MiB 1.2734 MiB z = x * 2
17 36.5703 MiB 0.0000 MiB gc.collect()
18 36.8242 MiB 0.2539 MiB return x
Filename: array_ops.py
Line # Mem usage Increment Line Contents
================================================
6 34.4258 MiB 34.4258 MiB @profile(precision=4)
7 def array_ops(num):
8 34.4258 MiB 0.0000 MiB gc.collect()
9 34.4258 MiB 0.0000 MiB size1 = 10 ** num
10 34.4258 MiB 0.0000 MiB size2 = 20 ** (num+1)
11 42.0391 MiB 7.6133 MiB x = [1] * size1
12 9807.7109 MiB 9765.6719 MiB y = [2] * size2
13 19573.2109 MiB 9765.5000 MiB y *= 2
14 42.1641 MiB 0.0000 MiB del y
15 42.1641 MiB 0.0000 MiB gc.collect()
16 57.3594 MiB 15.1953 MiB z = x * 2
17 57.3594 MiB 0.0000 MiB gc.collect()
18 57.3594 MiB 0.0000 MiB return x

Python: memory exception when writing data to disk

I have a big file, about 8 GB, to sort. I split it into blocks (each 3 GB), sort each block, and finally write the sorted data to disk. When writing to disk, memory usage increases immediately, even up to 9 GB. I don't think the memory use should be that large; maybe the memory isn't released after use. When I add del lines and gc.collect(), the memory use becomes normal, but the run takes twice as long as before. Does anyone know why?
Here is my code:
```
import heapq
import os
import sys
import time


class Sorter(object):
    def sort(self, filename='input', input_stream=None, out_filename=None, key=KEY_BY):
        sort_key = key

        def getLines(fname):
            for _ in open(fname, 'r'):
                yield (sort_key(_), _)

        st = time.time()
        splitter = FileSplitter(filename)
        if input_stream:
            splitter.split(self.block_size, input_stream=input_stream, key=sort_key, buffer_size=self.buffer_size)
        else:
            splitter.split(self.block_size, key=sort_key, buffer_size=self.buffer_size)
        # splitter.split(self.block_size, sort_key, self.buffer_size)
        print >> sys.stderr, 'sort', time.time() - st

        st = time.time()
        filelist = map(getLines, splitter.get_block_filenames())
        r = heapq.merge(*filelist)
        if not out_filename:
            f = open(filename + '.out', 'w')
        else:
            f = open(out_filename, 'w')
        map(lambda _: f.write(_[1]), r)
        print >> sys.stderr, 'merge', time.time() - st
        splitter.cleanup()


class FileSplitter(object):
    BLOCK_FILENAME_FORMAT = 'block_{0}.dat'

    def __init__(self, filename):
        self.filename = filename
        self.block_filenames = []

    def write_block(self, data, block_number, buffer_size):
        filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
        file = open(filename, 'w', buffer_size)
        file.write(data)
        file.close()
        self.block_filenames.append(filename)

    def get_block_filenames(self):
        return self.block_filenames

    def split(self, block_size, input_stream=None, key=None, buffer_size=0):
        sort_key = key
        if not input_stream:
            file = open(self.filename, 'r', buffer_size)
        else:
            file = input_stream
        i = 0
        while True:
            lines = file.readlines(block_size)
            if lines == []:
                break
            if sort_key is None:
                lines.sort()
            else:
                lines.sort(key=sort_key)
            self.write_block(''.join(lines), i, buffer_size=buffer_size)
            i += 1

    def cleanup(self):
        map(lambda f: os.remove(f), self.block_filenames)
```
Here is the memory usage of each line:
```
Line # Mem usage Increment Line Contents
21 6817.344 MiB 18170.859 MiB @profile
22 def write_block(self, data, block_number, buffer_size):
23 6817.344 MiB -1652.430 MiB filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
24 6817.344 MiB -1652.430 MiB file = open(filename, 'w', buffer_size)
25 6817.344 MiB -1652.430 MiB file.write(data)
26 6817.344 MiB -1652.430 MiB file.close()
27 6817.344 MiB -1652.430 MiB self.block_filenames.append(filename)
Filename: ../../isorter.py
Line # Mem usage Increment Line Contents
32 21.504 MiB 21.504 MiB @profile
33 def split(self, block_size, input_stream=None, key=None, buffer_size=0):
36 21.504 MiB 0.000 MiB sort_key = key
37 21.504 MiB 0.000 MiB if not input_stream:
38 file = open(self.filename, 'r', buffer_size)
39 else:
40 21.504 MiB 0.000 MiB file = input_stream
41 21.504 MiB 0.000 MiB i = 0
43 5164.914 MiB 0.000 MiB while True:
44 5164.801 MiB 4979.527 MiB lines = file.readlines(block_size)
45 5164.801 MiB -157.926 MiB if lines == []:
46 5006.875 MiB -157.926 MiB break
47 5164.801 MiB 0.000 MiB if sort_key is None:
48 lines.sort()
49 else:
50 5164.914 MiB 5.844 MiB lines.sort(key=sort_key)
51 5164.914 MiB 12020.230 MiB self.write_block(''.join(lines), i, buffer_size=buffer_size)
52 5164.914 MiB 0.000 MiB i += 1
53 5006.879 MiB 0.004 MiB print
Filename: ../../isorter.py
Line # Mem usage Increment Line Contents
56 5006.906 MiB 5006.906 MiB @profile
57 def cleanup(self):
58 5006.906 MiB 0.000 MiB map(lambda f: os.remove(f), self.block_filenames)
Filename: ../../isorter.py
Line # Mem usage Increment Line Contents
61 5006.898 MiB 5006.898 MiB @profile
62 def merge(filelist):
63 5006.898 MiB 0.000 MiB return heapq.merge(*filelist)
Filename: ../../isorter.py
Line # Mem usage Increment Line Contents
71 21.340 MiB 21.340 MiB @profile
72 def sort(self, filename='input', input_stream=None, out_filename=None, key=KEY_BY):
76 21.504 MiB 0.000 MiB sort_key = key
77 5006.902 MiB 0.000 MiB def getLines(fname):
78 5006.918 MiB -189.023 MiB for _ in open(fname, 'r'):
79 5006.918 MiB -378.035 MiB yield (sort_key(_), _)
80 21.504 MiB 0.000 MiB st = time.time()
83 21.504 MiB 0.000 MiB splitter = FileSplitter(filename)
84 21.504 MiB 0.000 MiB if input_stream:
85 5006.879 MiB 5006.879 MiB splitter.split(self.block_size, input_stream=input_stream, key=sort_key, buffer_size=self.buffer_size)
86 else:
87 splitter.split(self.block_size, key=sort_key, buffer_size=self.buffer_size)
88 5006.898 MiB 0.020 MiB print >> sys.stderr, 'sort', time.time() - st
90 5006.898 MiB 0.000 MiB st = time.time()
91 5006.898 MiB 0.000 MiB filelist = map(getLines, splitter.get_block_filenames())
94 5006.898 MiB 5006.898 MiB r = merge(filelist)
96 5006.898 MiB 0.000 MiB if not out_filename:
97 f = open(filename + '.out', 'w')
98 else:
99 5006.898 MiB 0.000 MiB f = open(out_filename, 'w')
100 5006.918 MiB -378.051 MiB map(lambda _: f.write(_[1]), r)
101 5006.906 MiB -0.012 MiB print >> sys.stderr, 'merge', time.time() - st
102 5006.906 MiB 5006.906 MiB splitter.cleanup()
```
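One detail the trace suggests (a suggestion of mine, not an answer from the thread): `self.write_block(''.join(lines), ...)` builds a second block-sized string while `lines` is still referenced, so the peak is roughly twice the block size. Passing the list and letting writelines stream it avoids that extra copy, for example:

```python
def write_block(self, lines, block_number, buffer_size):
    # Sketch: write the sorted lines directly instead of ''.join(lines),
    # so no second full copy of the block is materialized in memory.
    filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
    block_file = open(filename, 'w', buffer_size)
    block_file.writelines(lines)
    block_file.close()
    self.block_filenames.append(filename)

# ...and in split(), call it with the list itself:
# self.write_block(lines, i, buffer_size=buffer_size)
```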
You can use a numpy array instead of Python's list. Numpy arrays use less memory than lists.
For example, if your code is:
x = []
for a in mylist:
    x.append(a)
replace that with:
import numpy as np

x = np.array([])
for a in mylist:
    x = np.append(x, a)
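A caveat on that suggestion (my addition): np.append copies the entire array on every call, so growing an array this way is quadratic. If the length is known, or the values can come from an iterator, preallocating or using np.fromiter is usually leaner and faster:

```python
import numpy as np

mylist = [3, 1, 4, 1, 5, 9]  # illustrative input

# Preallocate when the length is known
x = np.empty(len(mylist), dtype=np.int64)
for idx, a in enumerate(mylist):
    x[idx] = a

# Or build directly from any iterable without an intermediate list
y = np.fromiter(mylist, dtype=np.int64)
```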

Profiling the code with memory_profiler increases the execution time

I am writing a simple application which splits a large text file into smaller files, and I have written two versions of it, one using lists and one using generators. I profiled both versions using the memory_profiler module, and it clearly showed the better memory efficiency of the generator version; strangely enough, though, profiling the generator version increases its execution time. The demonstration below shows what I mean.
Version using Lists
from memory_profiler import profile

@profile()
def main():
    file_name = input("Enter the full path of file you want to split into smaller inputFiles: ")
    input_file = open(file_name).readlines()
    num_lines_orig = len(input_file)
    parts = int(input("Enter the number of parts you want to split in: "))
    output_files = [(file_name + str(i)) for i in range(1, parts + 1)]
    st = 0
    p = int(num_lines_orig / parts)
    ed = p
    for i in range(parts-1):
        with open(output_files[i], "w") as OF:
            OF.writelines(input_file[st:ed])
        st = ed
        ed = st + p

    with open(output_files[-1], "w") as OF:
        OF.writelines(input_file[st:])

if __name__ == "__main__":
    main()
When run with the profiler:
$ time py36 Splitting\ text\ files_BAD_usingLists.py
Enter the full path of file you want to split into smaller inputFiles: /apps/nttech/rbhanot/Downloads/test.txt
Enter the number of parts you want to split in: 3
Filename: Splitting text files_BAD_usingLists.py
Line # Mem usage Increment Line Contents
================================================
6 47.8 MiB 0.0 MiB @profile()
7 def main():
8 47.8 MiB 0.0 MiB file_name = input("Enter the full path of file you want to split into smaller inputFiles: ")
9 107.3 MiB 59.5 MiB input_file = open(file_name).readlines()
10 107.3 MiB 0.0 MiB num_lines_orig = len(input_file)
11 107.3 MiB 0.0 MiB parts = int(input("Enter the number of parts you want to split in: "))
12 107.3 MiB 0.0 MiB output_files = [(file_name + str(i)) for i in range(1, parts + 1)]
13 107.3 MiB 0.0 MiB st = 0
14 107.3 MiB 0.0 MiB p = int(num_lines_orig / parts)
15 107.3 MiB 0.0 MiB ed = p
16 108.1 MiB 0.7 MiB for i in range(parts-1):
17 107.6 MiB -0.5 MiB with open(output_files[i], "w") as OF:
18 108.1 MiB 0.5 MiB OF.writelines(input_file[st:ed])
19 108.1 MiB 0.0 MiB st = ed
20 108.1 MiB 0.0 MiB ed = st + p
21
22 108.1 MiB 0.0 MiB with open(output_files[-1], "w") as OF:
23 108.1 MiB 0.0 MiB OF.writelines(input_file[st:])
real 0m6.115s
user 0m0.764s
sys 0m0.052s
When run without the profiler:
$ time py36 Splitting\ text\ files_BAD_usingLists.py
Enter the full path of file you want to split into smaller inputFiles: /apps/nttech/rbhanot/Downloads/test.txt
Enter the number of parts you want to split in: 3
real 0m5.916s
user 0m0.696s
sys 0m0.080s
Now the one using generators
from memory_profiler import profile

@profile()
def main():
    file_name = input("Enter the full path of file you want to split into smaller inputFiles: ")
    input_file = open(file_name)
    num_lines_orig = sum(1 for _ in input_file)
    input_file.seek(0)
    parts = int(input("Enter the number of parts you want to split in: "))
    output_files = ((file_name + str(i)) for i in range(1, parts + 1))
    st = 0
    p = int(num_lines_orig / parts)
    ed = p
    for i in range(parts-1):
        file = next(output_files)
        with open(file, "w") as OF:
            for _ in range(st, ed):
                OF.writelines(input_file.readline())

        st = ed
        ed = st + p
        if num_lines_orig - ed < p:
            ed = st + (num_lines_orig - ed) + p
        else:
            ed = st + p

    file = next(output_files)
    with open(file, "w") as OF:
        for _ in range(st, ed):
            OF.writelines(input_file.readline())

if __name__ == "__main__":
    main()
When run with the profiler option:
$ time py36 -m memory_profiler Splitting\ text\ files_GOOD_usingGenerators.py
Enter the full path of file you want to split into smaller inputFiles: /apps/nttech/rbhanot/Downloads/test.txt
Enter the number of parts you want to split in: 3
Filename: Splitting text files_GOOD_usingGenerators.py
Line # Mem usage Increment Line Contents
================================================
4 47.988 MiB 0.000 MiB @profile()
5 def main():
6 47.988 MiB 0.000 MiB file_name = input("Enter the full path of file you want to split into smaller inputFiles: ")
7 47.988 MiB 0.000 MiB input_file = open(file_name)
8 47.988 MiB 0.000 MiB num_lines_orig = sum(1 for _ in input_file)
9 47.988 MiB 0.000 MiB input_file.seek(0)
10 47.988 MiB 0.000 MiB parts = int(input("Enter the number of parts you want to split in: "))
11 48.703 MiB 0.715 MiB output_files = ((file_name + str(i)) for i in range(1, parts + 1))
12 47.988 MiB -0.715 MiB st = 0
13 47.988 MiB 0.000 MiB p = int(num_lines_orig / parts)
14 47.988 MiB 0.000 MiB ed = p
15 48.703 MiB 0.715 MiB for i in range(parts-1):
16 48.703 MiB 0.000 MiB file = next(output_files)
17 48.703 MiB 0.000 MiB with open(file, "w") as OF:
18 48.703 MiB 0.000 MiB for _ in range(st, ed):
19 48.703 MiB 0.000 MiB OF.writelines(input_file.readline())
20
21 48.703 MiB 0.000 MiB st = ed
22 48.703 MiB 0.000 MiB ed = st + p
23 48.703 MiB 0.000 MiB if num_lines_orig - ed < p:
24 48.703 MiB 0.000 MiB ed = st + (num_lines_orig - ed) + p
25 else:
26 48.703 MiB 0.000 MiB ed = st + p
27
28 48.703 MiB 0.000 MiB file = next(output_files)
29 48.703 MiB 0.000 MiB with open(file, "w") as OF:
30 48.703 MiB 0.000 MiB for _ in range(st, ed):
31 48.703 MiB 0.000 MiB OF.writelines(input_file.readline())
real 1m48.071s
user 1m13.144s
sys 0m19.652s
When run without the profiler:
$ time py36 Splitting\ text\ files_GOOD_usingGenerators.py
Enter the full path of file you want to split into smaller inputFiles: /apps/nttech/rbhanot/Downloads/test.txt
Enter the number of parts you want to split in: 3
real 0m10.429s
user 0m3.160s
sys 0m0.016s
So, first of all, why is profiling making my code slow? Secondly, if profiling impacts execution speed, why doesn't this effect show up in the version of the code that uses lists?
I CPU-profiled the code using line_profiler and got the answer this time. The reason the generator version takes more time is the lines below:
19 2 11126.0 5563.0 0.2 with open(file, "w") as OF:
20 379886 200418.0 0.5 3.0 for _ in range(st, ed):
21 379884 2348653.0 6.2 35.1 OF.writelines(input_file.readline())
And the reason it does not slow down the lists version is because:
19 2 9419.0 4709.5 0.4 with open(output_files[i], "w") as OF:
20 2 1654165.0 827082.5 65.1 OF.writelines(input_file[st:ed])
For lists, the new file is written by simply slicing the list, which is in fact a single statement. For the generator version, however, the new file is populated by reading the input file line by line, which makes the memory profiler fire on every single line and adds up to the increased CPU time.
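If the goal is to keep the generator version's low memory footprint while avoiding hundreds of thousands of per-line profiler events, one option (a sketch of mine, not from the original post) is to hand each output file an itertools.islice of the input file object, so each part is written by a single writelines call:

```python
from itertools import islice

def split_file(file_name, parts):
    # Count lines once, then rewind by reopening
    with open(file_name) as fh:
        num_lines = sum(1 for _ in fh)
    p = num_lines // parts
    with open(file_name) as fh:
        for i in range(1, parts + 1):
            # The last part also absorbs any remainder lines (islice(fh, None)
            # yields everything left in the file)
            count = p if i < parts else None
            with open(file_name + str(i), "w") as out:
                out.writelines(islice(fh, count))

split_file("/tmp/test.txt", 3)  # illustrative path
```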

Memory optimization and extending RAM with HDF5 or Pickle

I have to load a very large data file which is bigger than my RAM. I tried to do that with both Pickle and HDF5, but the data get loaded into memory anyway.
Is there a way to access the data without loading them into memory, i.e. accessing them directly on disk?
from memory_profiler import profile
import numpy as np
import pandas as pd
import cPickle
import gc
import time

basepath = '/Users/toto/Desktop/'

@profile
def test_write():
    dim = 10000000
    df = pd.DataFrame({'test': range(dim)}, index=range(dim))
    for i in range(30):
        df[str(i)] = df['test'] * np.random.normal(0, 1)

    print 'df created'
    cPickle.dump(df, open(basepath + 'df_pickle', 'wb'))
    gc.collect()
    store = pd.HDFStore(basepath + 'df_HDFpd')
    store['df'] = df
    store.close()
    gc.collect()
    del df
    gc.collect()

@profile
def test_read(method):
    print method
    if method == 'pickle':
        df = cPickle.load(open(basepath + 'df_pickle', 'rb'))
    if method == 'HDF':
        store = pd.HDFStore(basepath + 'df_HDFpd')
        df = store['df']
    print df.head(5)

    try:
        store.close()
    except:
        pass

#test_write()
timer = time.time()
test_read('HDF')
print 'Execution time: ', time.time() - timer
Result for test_write():
Line # Mem usage Increment Line Contents
================================================
12 42.5 MiB 0.0 MiB @profile
13 def test_write():
14 42.5 MiB 0.0 MiB dim = 10000000
15 969.4 MiB 926.8 MiB df = pd.DataFrame({'test':range(dim)}, index=range(dim))
16 3029.7 MiB 2060.3 MiB for i in range(30):
17 3029.7 MiB 0.0 MiB df[str(i)]=df['test'] * np.random.normal(0,1)
18
19 3029.7 MiB 0.0 MiB print 'df created'
20 3029.7 MiB 0.1 MiB cPickle.dump(df, open(basepath + 'df_pickle', 'wb'))
21 2616.7 MiB -413.0 MiB gc.collect()
22 2619.7 MiB 3.0 MiB store = pd.HDFStore(basepath + 'df_HDFpd')
23 2695.3 MiB 75.5 MiB store['df'] = df
24 2695.4 MiB 0.1 MiB store.close()
25 2696.1 MiB 0.7 MiB gc.collect()
26 1319.8 MiB -1376.3 MiB del df
27 1319.8 MiB 0.0 MiB gc.collect()
Result for test_read('HDF'):
Line # Mem usage Increment Line Contents
================================================
29 42.5 MiB 0.0 MiB
30 @profile
31 42.5 MiB 0.0 MiB def test_read(method):
32 42.5 MiB 0.0 MiB print method
33 if method == 'pickle':
34 42.5 MiB 0.0 MiB df = cPickle.load(open(basepath + 'df_pickle', 'rb'))
35 46.7 MiB 4.2 MiB if method == 'HDF':
36 2488.7 MiB 2442.0 MiB store = pd.HDFStore(basepath + 'df_HDFpd')
37 2489.2 MiB 0.5 MiB df = store['df']
38 print df.head(5)
39 2489.2 MiB 0.0 MiB
40 2489.2 MiB 0.0 MiB try:
41 store.close()
42 except:
43 pass
Result for test_read('pickle'):
to come in a few minutes
If you use h5py, indexing into an H5File gives you an object that is not a NumPy array but is convertible to one. You can slice it, or operate on it directly in some way, which avoids reading the entire thing into memory at once.
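A minimal sketch of that approach with h5py, assuming the data can be stored as a plain numeric dataset (file and dataset names are illustrative):

```python
import h5py
import numpy as np

# Write a dataset once (illustrative)
with h5py.File('big.h5', 'w') as f:
    f.create_dataset('data', data=np.arange(10**7, dtype=np.float64))

# Read it back in slices without materializing the whole array
with h5py.File('big.h5', 'r') as f:
    dset = f['data']           # h5py Dataset: stays on disk
    first_chunk = dset[:1000]  # only this slice is loaded as a NumPy array
    total = 0.0
    for start in range(0, dset.shape[0], 10**6):
        total += dset[start:start + 10**6].sum()
    print(total)
```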
I haven't used HDF much yet, but it looks like you can read an HDF file incrementally with pandas.read_hdf(), either by using the start/stop arguments or by getting it to return an iterator.
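A sketch of the pandas route, under the assumption that the frame was written in table format (e.g. store.put('df', df, format='table')), which is what permits partial reads; the path and key are illustrative:

```python
import pandas as pd

# Row-range read: only rows [0, 100000) are loaded
df_head = pd.read_hdf('df_HDFpd', key='df', start=0, stop=100000)

# Iterator / chunked read over the whole file
total_rows = 0
for chunk in pd.read_hdf('df_HDFpd', key='df', chunksize=500000):
    total_rows += len(chunk)
print(total_rows)
```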
