I have a big file, about 8 GB, to sort. I split it into blocks (each about 3 GB), sort each block, and finally write the sorted data back to disk. While writing to disk, memory usage immediately increases, even up to 9 GB. I don't think memory use should be that large; maybe the memory wasn't released after use. When I add del statements and gc.collect(), memory use becomes normal, but the run takes twice as long as before. Does anyone know why?
Here is my code:
```
import heapq
import os
import sys
import time


class Sorter(object):
    def sort(self, filename='input', input_stream=None, out_filename=None, key=KEY_BY):
        sort_key = key

        def getLines(fname):
            for _ in open(fname, 'r'):
                yield (sort_key(_), _)

        st = time.time()
        splitter = FileSplitter(filename)
        if input_stream:
            splitter.split(self.block_size, input_stream=input_stream, key=sort_key, buffer_size=self.buffer_size)
        else:
            splitter.split(self.block_size, key=sort_key, buffer_size=self.buffer_size)
        # splitter.split(self.block_size, sort_key, self.buffer_size)
        print >> sys.stderr, 'sort', time.time() - st

        st = time.time()
        filelist = map(getLines, splitter.get_block_filenames())
        r = heapq.merge(*filelist)
        if not out_filename:
            f = open(filename + '.out', 'w')
        else:
            f = open(out_filename, 'w')
        map(lambda _: f.write(_[1]), r)
        print >> sys.stderr, 'merge', time.time() - st
        splitter.cleanup()


class FileSplitter(object):
    BLOCK_FILENAME_FORMAT = 'block_{0}.dat'

    def __init__(self, filename):
        self.filename = filename
        self.block_filenames = []

    def write_block(self, data, block_number, buffer_size):
        filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
        file = open(filename, 'w', buffer_size)
        file.write(data)
        file.close()
        self.block_filenames.append(filename)

    def get_block_filenames(self):
        return self.block_filenames

    def split(self, block_size, input_stream=None, key=None, buffer_size=0):
        sort_key = key
        if not input_stream:
            file = open(self.filename, 'r', buffer_size)
        else:
            file = input_stream
        i = 0
        while True:
            lines = file.readlines(block_size)
            if lines == []:
                break
            if sort_key is None:
                lines.sort()
            else:
                lines.sort(key=sort_key)
            self.write_block(''.join(lines), i, buffer_size=buffer_size)
            i += 1

    def cleanup(self):
        map(lambda f: os.remove(f), self.block_filenames)
```
Here is the memory usage reported for each line:
```
Line # Mem usage Increment Line Contents
21 6817.344 MiB 18170.859 MiB @profile
22 def write_block(self, data, block_number, buffer_size):
23 6817.344 MiB -1652.430 MiB filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
24 6817.344 MiB -1652.430 MiB file = open(filename, 'w', buffer_size)
25 6817.344 MiB -1652.430 MiB file.write(data)
26 6817.344 MiB -1652.430 MiB file.close()
27 6817.344 MiB -1652.430 MiB self.block_filenames.append(filename)
Filename: ../../isorter.py
Line # Mem usage Increment Line Contents
32 21.504 MiB 21.504 MiB @profile
33 def split(self, block_size, input_stream=None, key=None, buffer_size=0):
36 21.504 MiB 0.000 MiB sort_key = key
37 21.504 MiB 0.000 MiB if not input_stream:
38 file = open(self.filename, 'r', buffer_size)
39 else:
40 21.504 MiB 0.000 MiB file = input_stream
41 21.504 MiB 0.000 MiB i = 0
43 5164.914 MiB 0.000 MiB while True:
44 5164.801 MiB 4979.527 MiB lines = file.readlines(block_size)
45 5164.801 MiB -157.926 MiB if lines == []:
46 5006.875 MiB -157.926 MiB break
47 5164.801 MiB 0.000 MiB if sort_key is None:
48 lines.sort()
49 else:
50 5164.914 MiB 5.844 MiB lines.sort(key=sort_key)
51 5164.914 MiB 12020.230 MiB self.write_block(''.join(lines), i, buffer_size=buffer_size)
52 5164.914 MiB 0.000 MiB i += 1
53 5006.879 MiB 0.004 MiB print
Filename: ../../isorter.py
Line # Mem usage Increment Line Contents
56 5006.906 MiB 5006.906 MiB @profile
57 def cleanup(self):
58 5006.906 MiB 0.000 MiB map(lambda f: os.remove(f), self.block_filenames)
Filename: ../../isorter.py
Line # Mem usage Increment Line Contents
61 5006.898 MiB 5006.898 MiB @profile
62 def merge(filelist):
63 5006.898 MiB 0.000 MiB return heapq.merge(*filelist)
Filename: ../../isorter.py
Line # Mem usage Increment Line Contents
71 21.340 MiB 21.340 MiB @profile
72 def sort(self, filename='input', input_stream=None, out_filename=None, key=KEY_BY):
76 21.504 MiB 0.000 MiB sort_key = key
77 5006.902 MiB 0.000 MiB def getLines(fname):
78 5006.918 MiB -189.023 MiB for _ in open(fname, 'r'):
79 5006.918 MiB -378.035 MiB yield (sort_key(_), _)
80 21.504 MiB 0.000 MiB st = time.time()
83 21.504 MiB 0.000 MiB splitter = FileSplitter(filename)
84 21.504 MiB 0.000 MiB if input_stream:
85 5006.879 MiB 5006.879 MiB splitter.split(self.block_size, input_stream=input_stream, key=sort_key, buffer_size=self.buffer_size)
86 else:
87 splitter.split(self.block_size, key=sort_key, buffer_size=self.buffer_size)
88 5006.898 MiB 0.020 MiB print >> sys.stderr, 'sort', time.time() - st
90 5006.898 MiB 0.000 MiB st = time.time()
91 5006.898 MiB 0.000 MiB filelist = map(getLines, splitter.get_block_filenames())
94 5006.898 MiB 5006.898 MiB r = merge(filelist)
96 5006.898 MiB 0.000 MiB if not out_filename:
97 f = open(filename + '.out', 'w')
98 else:
99 5006.898 MiB 0.000 MiB f = open(out_filename, 'w')
100 5006.918 MiB -378.051 MiB map(lambda _: f.write(_[1]), r)
101 5006.906 MiB -0.012 MiB print >> sys.stderr, 'merge', time.time() - st
102 5006.906 MiB 5006.906 MiB splitter.cleanup()
```
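One detail worth noting from the trace above: the biggest jump is on the `self.write_block(''.join(lines), ...)` line. `''.join(lines)` builds a second, block-sized string while the `lines` list is still alive, so every block is briefly held in memory twice before it is written. A minimal sketch of a lower-copy variant (my own rewrite, not the poster's code) that passes the sorted lines straight to `writelines()`:
```
def write_block(self, lines, block_number, buffer_size):
    # Write the already-sorted lines directly; this avoids materialising
    # a second block-sized string with ''.join(lines) before writing.
    filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
    with open(filename, 'w', buffer_size) as block_file:
        block_file.writelines(lines)
    self.block_filenames.append(filename)

# and the call in split() would become:
#     self.write_block(lines, i, buffer_size=buffer_size)
```
Whether this accounts for the whole 9 GB peak depends on the block size and on what else is still referenced at that point, but it removes one guaranteed block-sized copy per iteration.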
You can use a NumPy array instead of Python's list; a NumPy array uses less memory than a list.
For example, if your code is:
```
x = []
for a in mylist:
    x.append(a)
```
replace it with:
```
import numpy as np

x = np.array([])
for a in mylist:
    x = np.append(x, a)
```
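For a rough sense of the difference, here is a small comparison sketch (the exact numbers are assumptions that vary by platform and Python build); note that `sys.getsizeof` on a list only counts its array of pointers, so the per-element `int` objects have to be added separately:
```
import sys
import numpy as np

n = 10**6
py_list = list(range(n))
np_arr = np.arange(n, dtype=np.int64)

# The list stores pointers plus one Python int object per element (~28 bytes each).
list_bytes = sys.getsizeof(py_list) + sum(sys.getsizeof(v) for v in py_list)
print('list   : %.1f MiB' % (list_bytes / 2.0**20))    # roughly 35-40 MiB
print('ndarray: %.1f MiB' % (np_arr.nbytes / 2.0**20)) # 8 MiB of raw int64 data
```
One caveat: `np.append` copies the whole array on every call, so for large inputs it is usually much faster to build the array in one go (for example with `np.fromiter` or by preallocating) than to append inside a loop.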
I want to generate and keep a set of tuples for a certain amount of time, but I found that the program seems to consume all available memory if given enough time.
I have tried two methods: one is deleting the newly generated variables, the other is calling gc.collect(). Neither of them worked. If I just generate the tuples and don't keep them, the program consumes a limited amount of memory.
generate and keep: gk.py
```
import gc
import time
from memory_profiler import profile
from random import sample
from sys import getsizeof


@profile
def loop(limit):
    t = time.time()
    i = 0
    A = set()
    while True:
        i += 1
        duration = time.time() - t
        a = tuple(sorted(sample(range(200), 100)))
        A.add(a)
        if not i % int(1e4):
            print('step {:.2e}...'.format(i))
        if duration > limit:
            print('done')
            break
        # method 1: delete the variables
        # del duration, a
        # method 2: use gc
        # gc.collect()
    memory = getsizeof(t) + getsizeof(i) + getsizeof(duration) + \
        getsizeof(a) + getsizeof(limit) + getsizeof(A)
    print('memory consumed: {:.2e}MB'.format(memory/2**20))
    pass


def main():
    limit = 300
    loop(limit)
    pass


if __name__ == '__main__':
    print('running...')
    main()
```
generate and not keep: gnk.py
```
import time
from memory_profiler import profile
from random import sample
from sys import getsizeof


@profile
def loop(limit):
    t = time.time()
    i = 0
    while True:
        i += 1
        duration = time.time() - t
        a = tuple(sorted(sample(range(200), 100)))
        if not i % int(1e4):
            print('step {:.2e}...'.format(i))
        if duration > limit:
            print('done')
            break
    memory = getsizeof(t) + getsizeof(i) + getsizeof(duration) + \
        getsizeof(a) + getsizeof(limit)
    print('memory consumed: {:.2e}MB'.format(memory/2**20))
    pass


def main():
    limit = 300
    loop(limit)
    pass


if __name__ == '__main__':
    print('running...')
    main()
```
use "mprof" (needs module memory_profiler) in cmd/shell to check memory usage
mprof run my_file.py
mprof plot
result of gk.py
memory consumed: 4.00e+00MB
Filename: gk.py
Line # Mem usage Increment Line Contents
================================================
12 32.9 MiB 32.9 MiB @profile
13 def loop(limit):
14 32.9 MiB 0.0 MiB t = time.time()
15 32.9 MiB 0.0 MiB i = 0
16 32.9 MiB 0.0 MiB A = set()
17 32.9 MiB 0.0 MiB while True:
18 115.8 MiB 0.0 MiB i += 1
19 115.8 MiB 0.0 MiB duration = time.time() - t
20 115.8 MiB 0.3 MiB a = tuple(sorted(sample(range(200), 100)))
21 115.8 MiB 2.0 MiB A.add(a)
22 115.8 MiB 0.0 MiB if not i % int(1e4):
23 111.8 MiB 0.0 MiB print('step {:.2e}...'.format(i))
24 115.8 MiB 0.0 MiB if duration > limit:
25 115.8 MiB 0.0 MiB print('done')
26 115.8 MiB 0.0 MiB break
27 # method 1: delete the variables
28 # del duration, a
29 # method 2: use gc
30 # gc.collect()
31 memory = getsizeof(t) + getsizeof(i) + getsizeof(duration) + \
32 115.8 MiB 0.0 MiB getsizeof(a) + getsizeof(limit) + getsizeof(A)
33 115.8 MiB 0.0 MiB print('memory consumed: {:.2e}MB'.format(memory/2**20))
34 115.8 MiB 0.0 MiB pass
result of gnk.py
memory consumed: 9.08e-04MB
Filename: gnk.py
Line # Mem usage Increment Line Contents
================================================
11 33.0 MiB 33.0 MiB @profile
12 def loop(limit):
13 33.0 MiB 0.0 MiB t = time.time()
14 33.0 MiB 0.0 MiB i = 0
15 33.0 MiB 0.0 MiB while True:
16 33.0 MiB 0.0 MiB i += 1
17 33.0 MiB 0.0 MiB duration = time.time() - t
18 33.0 MiB 0.1 MiB a = tuple(sorted(sample(range(200), 100)))
19 33.0 MiB 0.0 MiB if not i % int(1e4):
20 33.0 MiB 0.0 MiB print('step {:.2e}...'.format(i))
21 33.0 MiB 0.0 MiB if duration > limit:
22 33.0 MiB 0.0 MiB print('done')
23 33.0 MiB 0.0 MiB break
24 memory = getsizeof(t) + getsizeof(i) + getsizeof(duration) + \
25 33.0 MiB 0.0 MiB getsizeof(a) + getsizeof(limit)
26 33.0 MiB 0.0 MiB print('memory consumed: {:.2e}MB'.format(memory/2**20))
27 33.0 MiB 0.0 MiB pass
I have two questions:
1. Both programs consumed more memory than their variables occupied: gk.py consumed 115.8 MB while its variables occupied 4.00 MB, and gnk.py consumed 33.0 MB while its variables occupied 9.08e-04 MB. Why do the programs consume more memory than the corresponding variables occupy?
2. The memory consumed by gk.py increases linearly with time, while the memory consumed by gnk.py remains constant. Why does this happen?
Any help would be appreciated.
Given that the size of the set keeps increasing, it will eventually consume all available memory.
An estimate (from my computer):
10 seconds of code running ~ 5e4 tuples saved to the set
300 seconds of code running ~ 1.5e6 tuples saved to the set
1 tuple = 100 integers ~ 400 bytes
Total: 1.5e6 * 400 bytes = 6e8 bytes = 600 MB filled in 300 s
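Two things are worth adding to this estimate. First, `getsizeof(A)` is shallow: it only counts the set's internal hash table, not the tuples (or the integers) that the set references, which is why the reported 4 MB is so far below the ~116 MiB the process actually uses. Second, rather than guessing the per-tuple cost, you can measure it; a quick sketch (the numbers are machine-dependent, and the 5e4-tuples-per-10-s rate is just the figure quoted above):
```
from random import sample
from sys import getsizeof

# one tuple of the kind the loop stores
a = tuple(sorted(sample(range(200), 100)))

per_tuple = getsizeof(a)        # the tuple header plus 100 pointers
tuples_per_sec = 5e4 / 10       # rate observed above
seconds = 300

total_bytes = tuples_per_sec * seconds * per_tuple
print('per tuple : %d bytes' % per_tuple)
print('after %ds : ~%.0f MB, plus set/hash-table overhead' % (seconds, total_bytes / 2**20))
```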
I am writing a simple application that splits a large text file into smaller files, and I have written two versions of it, one using lists and one using generators. I profiled both versions using the memory_profiler module, and it clearly showed the better memory efficiency of the generator version. Strangely, though, when the generator version is profiled, the execution time increases. The demonstration below explains what I mean.
Version using Lists
```
from memory_profiler import profile


@profile()
def main():
    file_name = input("Enter the full path of file you want to split into smaller inputFiles: ")
    input_file = open(file_name).readlines()
    num_lines_orig = len(input_file)
    parts = int(input("Enter the number of parts you want to split in: "))
    output_files = [(file_name + str(i)) for i in range(1, parts + 1)]
    st = 0
    p = int(num_lines_orig / parts)
    ed = p
    for i in range(parts-1):
        with open(output_files[i], "w") as OF:
            OF.writelines(input_file[st:ed])
        st = ed
        ed = st + p

    with open(output_files[-1], "w") as OF:
        OF.writelines(input_file[st:])


if __name__ == "__main__":
    main()
```
When run with the profiler:
$ time py36 Splitting\ text\ files_BAD_usingLists.py
Enter the full path of file you want to split into smaller inputFiles: /apps/nttech/rbhanot/Downloads/test.txt
Enter the number of parts you want to split in: 3
Filename: Splitting text files_BAD_usingLists.py
Line # Mem usage Increment Line Contents
================================================
6 47.8 MiB 0.0 MiB @profile()
7 def main():
8 47.8 MiB 0.0 MiB file_name = input("Enter the full path of file you want to split into smaller inputFiles: ")
9 107.3 MiB 59.5 MiB input_file = open(file_name).readlines()
10 107.3 MiB 0.0 MiB num_lines_orig = len(input_file)
11 107.3 MiB 0.0 MiB parts = int(input("Enter the number of parts you want to split in: "))
12 107.3 MiB 0.0 MiB output_files = [(file_name + str(i)) for i in range(1, parts + 1)]
13 107.3 MiB 0.0 MiB st = 0
14 107.3 MiB 0.0 MiB p = int(num_lines_orig / parts)
15 107.3 MiB 0.0 MiB ed = p
16 108.1 MiB 0.7 MiB for i in range(parts-1):
17 107.6 MiB -0.5 MiB with open(output_files[i], "w") as OF:
18 108.1 MiB 0.5 MiB OF.writelines(input_file[st:ed])
19 108.1 MiB 0.0 MiB st = ed
20 108.1 MiB 0.0 MiB ed = st + p
21
22 108.1 MiB 0.0 MiB with open(output_files[-1], "w") as OF:
23 108.1 MiB 0.0 MiB OF.writelines(input_file[st:])
real 0m6.115s
user 0m0.764s
sys 0m0.052s
When run without the profiler:
$ time py36 Splitting\ text\ files_BAD_usingLists.py
Enter the full path of file you want to split into smaller inputFiles: /apps/nttech/rbhanot/Downloads/test.txt
Enter the number of parts you want to split in: 3
real 0m5.916s
user 0m0.696s
sys 0m0.080s
Now the one using generators:
```
from memory_profiler import profile


@profile()
def main():
    file_name = input("Enter the full path of file you want to split into smaller inputFiles: ")
    input_file = open(file_name)
    num_lines_orig = sum(1 for _ in input_file)
    input_file.seek(0)
    parts = int(input("Enter the number of parts you want to split in: "))
    output_files = ((file_name + str(i)) for i in range(1, parts + 1))
    st = 0
    p = int(num_lines_orig / parts)
    ed = p
    for i in range(parts-1):
        file = next(output_files)
        with open(file, "w") as OF:
            for _ in range(st, ed):
                OF.writelines(input_file.readline())

        st = ed
        ed = st + p
        if num_lines_orig - ed < p:
            ed = st + (num_lines_orig - ed) + p
        else:
            ed = st + p

    file = next(output_files)
    with open(file, "w") as OF:
        for _ in range(st, ed):
            OF.writelines(input_file.readline())


if __name__ == "__main__":
    main()
```
When run with the profiler:
$ time py36 -m memory_profiler Splitting\ text\ files_GOOD_usingGenerators.py
Enter the full path of file you want to split into smaller inputFiles: /apps/nttech/rbhanot/Downloads/test.txt
Enter the number of parts you want to split in: 3
Filename: Splitting text files_GOOD_usingGenerators.py
Line # Mem usage Increment Line Contents
================================================
4 47.988 MiB 0.000 MiB @profile()
5 def main():
6 47.988 MiB 0.000 MiB file_name = input("Enter the full path of file you want to split into smaller inputFiles: ")
7 47.988 MiB 0.000 MiB input_file = open(file_name)
8 47.988 MiB 0.000 MiB num_lines_orig = sum(1 for _ in input_file)
9 47.988 MiB 0.000 MiB input_file.seek(0)
10 47.988 MiB 0.000 MiB parts = int(input("Enter the number of parts you want to split in: "))
11 48.703 MiB 0.715 MiB output_files = ((file_name + str(i)) for i in range(1, parts + 1))
12 47.988 MiB -0.715 MiB st = 0
13 47.988 MiB 0.000 MiB p = int(num_lines_orig / parts)
14 47.988 MiB 0.000 MiB ed = p
15 48.703 MiB 0.715 MiB for i in range(parts-1):
16 48.703 MiB 0.000 MiB file = next(output_files)
17 48.703 MiB 0.000 MiB with open(file, "w") as OF:
18 48.703 MiB 0.000 MiB for _ in range(st, ed):
19 48.703 MiB 0.000 MiB OF.writelines(input_file.readline())
20
21 48.703 MiB 0.000 MiB st = ed
22 48.703 MiB 0.000 MiB ed = st + p
23 48.703 MiB 0.000 MiB if num_lines_orig - ed < p:
24 48.703 MiB 0.000 MiB ed = st + (num_lines_orig - ed) + p
25 else:
26 48.703 MiB 0.000 MiB ed = st + p
27
28 48.703 MiB 0.000 MiB file = next(output_files)
29 48.703 MiB 0.000 MiB with open(file, "w") as OF:
30 48.703 MiB 0.000 MiB for _ in range(st, ed):
31 48.703 MiB 0.000 MiB OF.writelines(input_file.readline())
real 1m48.071s
user 1m13.144s
sys 0m19.652s
When run without the profiler:
$ time py36 Splitting\ text\ files_GOOD_usingGenerators.py
Enter the full path of file you want to split into smaller inputFiles: /apps/nttech/rbhanot/Downloads/test.txt
Enter the number of parts you want to split in: 3
real 0m10.429s
user 0m3.160s
sys 0m0.016s
So, first of all, why is profiling making my code slow? Secondly, if profiling impacts execution speed, why does this effect not show up in the version of the code using lists?
I CPU-profiled the code using line_profiler and this time I got the answer. The reason the generator version takes more time is the following lines:
19 2 11126.0 5563.0 0.2 with open(file, "w") as OF:
20 379886 200418.0 0.5 3.0 for _ in range(st, ed):
21 379884 2348653.0 6.2 35.1 OF.writelines(input_file.readline())
And the reason it does not slow down the lists version is:
19 2 9419.0 4709.5 0.4 with open(output_files[i], "w") as OF:
20 2 1654165.0 827082.5 65.1 OF.writelines(input_file[st:ed])
For lists, each new file is written simply by slicing out a copy of the list, which is in fact a single statement. For the generator version, however, each new file is populated by reading the input file line by line, so the memory profiler has to take a measurement for every one of those line executions, and that adds up to the increased CPU time.
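If the goal is to keep the generator version's low memory footprint while avoiding one traced statement per input line, one possible middle ground (my own sketch, not the code from the question) is to hand `writelines()` an `itertools.islice` over the open file, so each part is still streamed but written by a single statement:
```
from itertools import islice

def split_streaming(file_name, parts):
    with open(file_name) as fh:
        num_lines = sum(1 for _ in fh)
        fh.seek(0)
        per_part = num_lines // parts
        for i in range(1, parts + 1):
            # the last part absorbs any remainder
            count = per_part if i < parts else num_lines - per_part * (parts - 1)
            with open(file_name + str(i), "w") as out:
                # writelines() consumes the islice lazily, so memory stays bounded,
                # but the profiler only sees this one statement per part.
                out.writelines(islice(fh, count))
```
Peak memory stays close to the generator version, while the per-line iteration happens inside `writelines()` rather than in profiled Python code.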
I have a simple piece of code that reads a CSV file, looks for duplicates based on the first two columns, writes the duplicates to another CSV, and keeps the unique values in a third CSV.
I am using a set:
```
import csv


def my_func():
    area = "W09"
    inf = r'f:\JDo\Cleaned\_merged\\'+ area +'.csv'
    out = r'f:\JDo\Cleaned\_merged\no_duplicates\\'+area+'_no_duplicates.csv'
    out2 = r'f:\JDo\Cleaned\_merged\duplicates\\'+area+"_duplicates.csv"

    #i = 0
    seen = set()

    with open(inf, 'r') as infile, open(out, 'w') as outfile1, open(out2, 'w') as outfile2:
        reader = csv.reader(infile, delimiter=" ")
        writer1 = csv.writer(outfile1, delimiter=" ")
        writer2 = csv.writer(outfile2, delimiter=" ")
        for row in reader:
            x, y = row[0], row[1]
            x = float(x)
            y = float(y)
            if (x, y) in seen:
                writer2.writerow(row)
                continue
            seen.add((x, y))
            writer1.writerow(row)

        seen.clear()
```
I thought a set would be the best choice, but the size of the set ends up being about seven times the size of the input file (the input files range from 140 MB to 50 GB of CSV), and RAM usage goes from 1 GB to almost 400 GB (I am using a server with 768 GB of RAM).
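As a side note, a rough back-of-the-envelope count (my own estimate, not from the question) makes a roughly seven-fold blow-up plausible: each ~19-byte CSV row is kept as a 2-tuple object plus two Python float objects plus a slot in the set's hash table. Something like this, on 64-bit CPython, where the 32-byte slot figure is an assumed average that includes the table's over-allocation:
```
import sys

line_bytes  = len("475596 101832 4926\n")          # ~19 bytes of raw CSV per row
tuple_bytes = sys.getsizeof((475596.0, 101832.0))  # ~64 bytes for the tuple object
float_bytes = 2 * sys.getsizeof(475596.0)          # ~48 bytes for the two float objects
slot_bytes  = 32                                   # assumed average cost of a set slot

per_row = tuple_bytes + float_bytes + slot_bytes
print('%d bytes kept per %d-byte row -> ~%.1fx' % (per_row, line_bytes, per_row / float(line_bytes)))
```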
I also ran the profiler on a small sample:
Line # Mem usage Increment Line Contents
8 21.289 MiB 21.289 MiB @profile
9 def my_func():
10 21.293 MiB 0.004 MiB area = "W10"
11
12 21.293 MiB 0.000 MiB inf = r'f:\JDo\Cleaned\_merged\\'+ area +'.csv'
13 21.293 MiB 0.000 MiB out = r'f:\JDo\Cleaned\_merged\no_duplicates\\'+area+'_no_duplicates.csv'
14 21.297 MiB 0.004 MiB out2 = r'f:\JDo\Cleaned\_merged\duplicates\\'+area+"_duplicates.csv"
15
16
17
18 #i = 0
19 21.297 MiB 0.000 MiB seen = set()
20
21 21.297 MiB 0.000 MiB with open(inf, 'r') as infile, open(out,'w') as outfile1, open(out2, 'w') as outfile2:
22 21.297 MiB 0.000 MiB reader = csv.reader(infile, delimiter=" ")
23 21.297 MiB 0.000 MiB writer1 = csv.writer(outfile1, delimiter=" ")
24 21.297 MiB 0.000 MiB writer2 = csv.writer(outfile2, delimiter=" ")
25 1089.914 MiB -9.008 MiB for row in reader:
26 1089.914 MiB -7.977 MiB x, y = row[0], row[1]
27
28 1089.914 MiB -6.898 MiB x = float(x)
29 1089.914 MiB 167.375 MiB y = float(y)
30
31 1089.914 MiB 166.086 MiB if (x, y) in seen:
32 #z = line.split(" ",3)[-1]
33 #if z == "5284":
34 # print X, Y, z
35
36 1089.914 MiB 0.004 MiB writer2.writerow(row)
37 1089.914 MiB 0.000 MiB continue
38 1089.914 MiB 714.102 MiB seen.add((x, y))
39 1089.914 MiB -9.301 MiB writer1.writerow(row)
40
41
42
43 690.426 MiB -399.488 MiB seen.clear()
What could be the issue? Is there a faster way to filter out the duplicates, or a way that uses less RAM?
Sample of the CSV (it is a GeoTIFF converted to a CSV file, so the columns are X, Y, Value):
475596 101832 4926
475626 101832 4926
475656 101832 4926
475686 101832 4926
475716 101832 4926
475536 101802 4926
475566 101802 4926
475596 101802 4926
475626 101802 4926
475656 101802 4926
475686 101802 4926
475716 101802 4926
475746 101802 4926
475776 101802 4926
475506 101772 4926
475536 101772 4926
475566 101772 4926
475596 101772 4926
475626 101772 4926
475656 101772 4926
475686 101772 4926
475716 101772 4926
475746 101772 4926
475776 101772 4926
475806 101772 4926
475836 101772 4926
475476 101742 4926
475506 101742 4926
EDIT:
So I tried the solution offered by Jean:
https://stackoverflow.com/a/49008391/9418396
The result is that on my small 140 MB CSV the size of the set is now halved, which is a good improvement. I will try to run it on the bigger data and see what it does. I can't really check it with the profiler, because the profiler prolongs the execution time by a huge amount.
Line # Mem usage Increment Line Contents
8 21.273 MiB 21.273 MiB @profile
9 def my_func():
10 21.277 MiB 0.004 MiB area = "W10"
11
12 21.277 MiB 0.000 MiB inf = r'f:\JDo\Cleaned\_merged\\'+ area +'.csv'
13 21.277 MiB 0.000 MiB out = r'f:\JDo\Cleaned\_merged\no_duplicates\\'+area+'_no_duplicates.csv'
14 21.277 MiB 0.000 MiB out2 = r'f:\JDo\Cleaned\_merged\duplicates\\'+area+"_duplicates.csv"
15
16
17 21.277 MiB 0.000 MiB seen = set()
18
19 21.277 MiB 0.000 MiB with open(inf, 'r') as infile, open(out,'w') as outfile1, open(out2, 'w') as outfile2:
20 21.277 MiB 0.000 MiB reader = csv.reader(infile, delimiter=" ")
21 21.277 MiB 0.000 MiB writer1 = csv.writer(outfile1, delimiter=" ")
22 21.277 MiB 0.000 MiB writer2 = csv.writer(outfile2, delimiter=" ")
23 451.078 MiB -140.355 MiB for row in reader:
24 451.078 MiB -140.613 MiB hash = float(row[0])*10**7 + float(row[1])
25 #x, y = row[0], row[1]
26
27 #x = float(x)
28 #y = float(y)
29
30 #if (x, y) in seen:
31 451.078 MiB 32.242 MiB if hash in seen:
32 451.078 MiB 0.000 MiB writer2.writerow(row)
33 451.078 MiB 0.000 MiB continue
34 451.078 MiB 78.500 MiB seen.add((hash))
35 451.078 MiB -178.168 MiB writer1.writerow(row)
36
37 195.074 MiB -256.004 MiB seen.clear()
You could create your own hash function so that you don't store a tuple of floats, but a single float value that combines the two floats in a unique way.
Let's say the coordinates cannot exceed 10 million (maybe you could even go down to 1 million). You could do:
```
hash = x*10**7 + y
```
(this performs a kind of logical "OR" on your floats, and since the values are limited, there's no mixing up between x and y)
Then put hash in your set instead of a tuple of floats. There's no risk of float absorption with 10**14, so it'd be worth a try:
```
>>> 10**14+1.5
100000000000001.5
```
The loop then becomes:
```
for row in reader:
    hash = float(row[0])*10**7 + float(row[1])
    if hash in seen:
        writer2.writerow(row)
        continue
    seen.add(hash)
    writer1.writerow(row)
```
One float, even a big one (the size of a float is fixed), is at least 2 or 3 times smaller in memory than a tuple of 2 floats. On my machine:
```
>>> sys.getsizeof((0.44,0.2))
64
>>> sys.getsizeof(14252362*10**7+35454555.0)
24
```
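A small variant of the same idea, in case it helps (this is my own sketch, not part of the answer above): the sample rows contain whole-number coordinates, so the key can be built from `int()` values instead of floats, which avoids any float-precision concerns while staying just as compact:
```
import csv

def dedupe(inf, out, out2, scale=10**7):
    # assumes X and Y are whole numbers smaller than `scale`, as in the sample data
    seen = set()
    with open(inf, 'r') as infile, open(out, 'w') as outfile1, open(out2, 'w') as outfile2:
        reader = csv.reader(infile, delimiter=" ")
        writer1 = csv.writer(outfile1, delimiter=" ")
        writer2 = csv.writer(outfile2, delimiter=" ")
        for row in reader:
            key = int(row[0]) * scale + int(row[1])   # one small int per row instead of a tuple
            if key in seen:
                writer2.writerow(row)
            else:
                seen.add(key)
                writer1.writerow(row)
```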
I wanted to use this example https://github.com/davidsandberg/facenet/blob/master/contributed/real_time_face_recognition.py but for running face recognition on a set of images. My main function looks as follows
```
def main(args):
    face_recognition = face.Recognition()
    if args.debug:
        face.debug = True
    path = './images/'
    folders = os.listdir(path)
    for f in folders:
        images = os.listdir(path + f)
        for i in images:
            img = cv2.imread(path + f + '/' + i)
            start = time.time()
            faces = face_recognition.identify(img)
            for face_n in faces:
                print(i, '-', face_n.name, '-', time.time() - start)
    del images, path, img, faces, folders
```
When I checked how the RAM was being used, I found a memory leak:
> 183.5 MiB 183.5 MiB @profile(stream=fp)
> def main(args):
> 383.7 MiB 200.2 MiB face_recognition = face.Recognition()
> 383.7 MiB 0.0 MiB if args.debug:
> face.debug = True
>
> 383.7 MiB 0.0 MiB path = './images/'
> 383.7 MiB 0.0 MiB folders = os.listdir(path)
> 1122.2 MiB -9.7 MiB for f in folders[:5]:
> 1113.9 MiB 0.0 MiB images = os.listdir(path + f)
> 1124.9 MiB -55.1 MiB for i in images:
> 1124.9 MiB -41.9 MiB img = cv2.imread(path + f + '/' + i)
> 1124.9 MiB -52.4 MiB start = time.time()
> 1124.9 MiB 679.9 MiB faces = face_recognition.identify(img)
> 1124.9 MiB -112.1 MiB for face_n in faces:
> 1124.9 MiB -52.7 MiB print(i, '-', face_n.name, '-', time.time() - start)
> 1122.2 MiB 0.0 MiB del images, path, img, faces, folders
Looks like the main leak is in https://github.com/davidsandberg/facenet/blob/master/contributed/face.py
> 141 389.0 MiB 389.0 MiB @profile(stream=fp)
> 142 def find_faces(self, image):
> 143 389.0 MiB 0.0 MiB faces = []
> 144
> 145 389.0 MiB 0.0 MiB bounding_boxes, _ = detect_face.detect_face(image, self.minsize,
> 146 389.0 MiB 0.0 MiB self.pnet, self.rnet, self.onet,
> 147 486.7 MiB 97.7 MiB self.threshold, self.factor)
> 148 486.7 MiB 0.0 MiB for bb in bounding_boxes:
How can I solve this issue when I still need to use loops?
I have to load a very large data file which is bigger than my RAM. I tried to do that with both pickle and HDF5, but the data get loaded into memory.
Is there a way to access the data without loading them into memory, accessing them directly on disk?
```
from memory_profiler import profile
import numpy as np
import pandas as pd
import cPickle
import gc
import time

basepath = '/Users/toto/Desktop/'


@profile
def test_write():
    dim = 10000000
    df = pd.DataFrame({'test': range(dim)}, index=range(dim))
    for i in range(30):
        df[str(i)] = df['test'] * np.random.normal(0, 1)

    print 'df created'
    cPickle.dump(df, open(basepath + 'df_pickle', 'wb'))
    gc.collect()
    store = pd.HDFStore(basepath + 'df_HDFpd')
    store['df'] = df
    store.close()
    gc.collect()
    del df
    gc.collect()


@profile
def test_read(method):
    print method
    if method == 'pickle':
        df = cPickle.load(open(basepath + 'df_pickle', 'rb'))
    if method == 'HDF':
        store = pd.HDFStore(basepath + 'df_HDFpd')
        df = store['df']
    print df.head(5)

    try:
        store.close()
    except:
        pass


#test_write()
timer = time.time()
test_read('HDF')
print 'Execution time:', time.time() - timer
```
Result for test_write():
Line # Mem usage Increment Line Contents
================================================
12 42.5 MiB 0.0 MiB @profile
13 def test_write():
14 42.5 MiB 0.0 MiB dim = 10000000
15 969.4 MiB 926.8 MiB df = pd.DataFrame({'test':range(dim)}, index=range(dim))
16 3029.7 MiB 2060.3 MiB for i in range(30):
17 3029.7 MiB 0.0 MiB df[str(i)]=df['test'] * np.random.normal(0,1)
18
19 3029.7 MiB 0.0 MiB print 'df created'
20 3029.7 MiB 0.1 MiB cPickle.dump(df, open(basepath + 'df_pickle', 'wb'))
21 2616.7 MiB -413.0 MiB gc.collect()
22 2619.7 MiB 3.0 MiB store = pd.HDFStore(basepath + 'df_HDFpd')
23 2695.3 MiB 75.5 MiB store['df'] = df
24 2695.4 MiB 0.1 MiB store.close()
25 2696.1 MiB 0.7 MiB gc.collect()
26 1319.8 MiB -1376.3 MiB del df
27 1319.8 MiB 0.0 MiB gc.collect()
Result for test_read('HDF'):
Line # Mem usage Increment Line Contents
================================================
29 42.5 MiB 0.0 MiB
30 @profile
31 42.5 MiB 0.0 MiB def test_read(method):
32 42.5 MiB 0.0 MiB print method
33 if method == 'pickle':
34 42.5 MiB 0.0 MiB df = cPickle.load(open(basepath + 'df_pickle', 'rb'))
35 46.7 MiB 4.2 MiB if method == 'HDF':
36 2488.7 MiB 2442.0 MiB store = pd.HDFStore(basepath + 'df_HDFpd')
37 2489.2 MiB 0.5 MiB df = store['df']
38 print df.head(5)
39 2489.2 MiB 0.0 MiB
40 2489.2 MiB 0.0 MiB try:
41 store.close()
42 except:
43 pass
Result for test_read('pickle'):
to come in a few minutes
If you use h5py, when you index into an H5File it gives you something which is not a NumPy array, but is convertible to one. So you should slice that, or operate on it directly in some way, which can avoid reading the entire thing into memory at once.
I haven't used HDF much yet, but it looks like you can read an HDF file in incrementally with pandas.read_hdf(), either by using the start/stop arguments or by getting it to return an iterator.
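A rough sketch of both suggestions (the file names, the 'df' key, and `process()` are placeholders; for the pandas start/stop/chunksize variants the frame has to be stored in 'table' format rather than with the plain `store['df'] = df` used above):
```
import h5py
import pandas as pd

# h5py: indexing a dataset returns only the requested slice, not the whole array.
with h5py.File('data.h5', 'r') as f:
    dset = f['my_dataset']        # no data read yet
    first_rows = dset[:100000]    # only this slice is loaded into memory

# pandas: re-save in table format, then read ranges or iterate in chunks.
df.to_hdf('df_table.h5', key='df', format='table')        # assumes `df` already exists
part = pd.read_hdf('df_table.h5', 'df', start=0, stop=100000)
for chunk in pd.read_hdf('df_table.h5', 'df', chunksize=500000):
    process(chunk)                # placeholder for per-chunk work
```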