too many files open error with multiprocessing - python

I have code that uses multiprocessing over about 10,000 files on a 12-core vCPU machine running Ubuntu.
import os
import multiprocessing
import nltk

def process_file(name):
    inp = open(name)
    out = open(name.split('.')[0] + 'wikiout.txt', 'a')
    for row in inp:
        text = row.strip()
        sent_text = nltk.sent_tokenize(text)
        for sent in sent_text:
            # process sentence
            pass
    inp.close()
    out.close()

if __name__ == '__main__':
    processes = []
    for i in 'ABCDEF':
        for j in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            for k in range(100):
                filename = os.path.join(os.path.dirname(__file__), i + j + '/' + 'wiki_' + str(k) + '.txt')
                p = multiprocessing.Process(target=process_file, args=(filename,))
                processes.append(p)
                p.start()
    for process in processes:
        process.join()
For some reason I get this error:
File "wikirules.py", line 37, in <module>
p.start()
File "/usr/lib/python3.8/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/usr/lib/python3.8/multiprocessing/context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "/usr/lib/python3.8/multiprocessing/context.py", line 277, in _Popen
return Popen(process_obj)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 69, in _launch
child_r, parent_w = os.pipe()
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
File "wikirules.py", line 13, in process_file
File "/usr/local/lib/python3.8/dist-packages/nltk/tokenize/__init__.py", line 106, in sent_tokenize
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 752, in load
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 877, in _open
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 327, in open
OSError: [Errno 24] Too many open files: '/root/nltk_data/tokenizers/punkt/PY3/english.pickle'
Any clue why this might be happening? I'm still new to multiprocessing. Shouldn't this open no more than 12 files at once?

Your code is trying to run
len('ABCDEF') * len('ABCD...Z') * len(range(100)) = 6 * 26 * 100 = 15,600
operating system processes simultaneously.
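Each child process needs its own pipe, and every pipe consumes file descriptors in the parent, so the per-process open-file limit (commonly a soft limit of 1024 on Ubuntu) is exhausted long before 15,600 children exist. You can inspect, and within the hard limit raise, that limit with the standard resource module. A minimal sketch, Unix only:

import resource

# current soft/hard limits on open file descriptors for this process
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print("open-file limit:", soft, hard)

# the soft limit can be raised up to the hard limit, but the real fix
# is to bound how many processes (and files) are open at the same time
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

Raising the limit only delays the failure, though; the real fix is to limit concurrency, as below.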
The multiprocessing module contains relatively low-level primitives. For basic tasks the standard library offers a safer and more convenient option: the concurrent.futures module, which provides pool implementations for both threads and processes and is especially useful for "embarrassingly parallel" workloads.
Here is an example of how the code from your question could be rewritten using concurrent.futures and a few other Python features such as generators, context managers and the pathlib module.
import concurrent.futures as futures
import itertools
import pathlib

import nltk

BASE_PATH = pathlib.Path(__file__).parent.absolute()

def filename_generator():
    """produce filenames sequence"""
    for i, j, k in itertools.product("ABCDEF", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", range(100)):
        yield BASE_PATH / f"{i}{j}/wiki_{k}.txt"

def worker(filename: pathlib.Path):
    """do all the job"""
    out_filename = filename.with_suffix('.wikiout.txt')
    with open(filename) as inp, open(out_filename, "a") as out:
        for row in inp:
            text = row.strip()
            sent_text = nltk.sent_tokenize(text)
            for sent in sent_text:
                """process sentence"""

def main():
    with futures.ProcessPoolExecutor() as pool:
        # mapping future->filename, useful in case of error
        task_to_filename = {pool.submit(worker, f): f for f in filename_generator()}
        for f in futures.as_completed(task_to_filename):
            try:
                f.result()
            except Exception as e:
                filename = task_to_filename[f]
                print(f"{filename} processing failed: {e}")

if __name__ == "__main__":
    main()
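By default ProcessPoolExecutor starts as many worker processes as os.cpu_count() reports, so on the 12-core machine only about a dozen input/output file pairs are open at any moment. If you want to pin it down explicitly (assuming the 12 vCPUs mentioned in the question):

with futures.ProcessPoolExecutor(max_workers=12) as pool:
    ...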

Related

Multiprocessing crashes on big-data (OSError: [Errno 24] Too many open files)

I'm using Python 3.8 to transform the .mp4 videos of a dataset (~700 GB) into images (one frame from every second of video) using multiprocessing; code below.
import os
import sys
import cv2
from pathlib import Path
from multiprocessing import Process

base_path = 'path_to_dataset'
images_dir = 'path_to_dataset_images'
image_fps = 1

def task(path, name):
    filename, file_extension = os.path.splitext(os.path.relpath(os.path.join(path, name), base_path))
    if file_extension != '.mp4':
        exit(0)
    video_path = os.path.join(path, name)
    image_path = os.path.join(images_dir, filename)
    extension = ".png"
    try:
        cap = cv2.VideoCapture(video_path)
        fps = round(cap.get(cv2.CAP_PROP_FPS))
        hop = round(fps / image_fps)
        curr_frame = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if curr_frame % hop == 0:
                name = image_path + "_" + str(curr_frame) + extension
                Path(os.path.dirname(name)).mkdir(parents=True, exist_ok=True)
                if cv2.imwrite(name, frame):
                    print(name + " saved")
                else:
                    print(name + " not saved!", file=sys.stderr)
            curr_frame += 1
    except:
        print(image_path + " failed due to error", file=sys.stderr)
    finally:
        cap.release()

def convert_to_image():
    # create all tasks
    processes = []
    for path, subdirs, files in os.walk(base_path):
        processes = processes + [Process(target=task, args=(path, name,)) for name in files]
    # start all processes
    for process in processes:
        process.start()
    # wait for all processes to complete
    for process in processes:
        process.join()
    # report that all tasks are completed
    print('Done', flush=True)

if __name__ == "__main__":
    convert_to_image()
Python gets the location of the dataset files, creates a new folder with the dataset_image name, and maintains the folder structure of the original video dataset. The code runs with 8 cores and 128 GB of RAM. After about 8.5 hours of running (60+ hours of CPU time), it crashes with the error below. I've tried variants with and without a try-catch block, without any improvement. The images are saved correctly, as I want, and the folder structure is perfectly fine.
Traceback (most recent call last):
File "convert_to_images.py", line 72, in <module>
convert_to_image()
File "convert_to_images.py", line 59, in convert_to_image
process.start()
File "/home/user/.conda/envs/tf2.9/lib/python3.8/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/home/user/.conda/envs/tf2.9/lib/python3.8/multiprocessing/context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "/home/user/.conda/envs/tf2.9/lib/python3.8/multiprocessing/context.py", line 277, in _Popen
return Popen(process_obj)
File "/home/user/.conda/envs/tf2.9/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/home/user/.conda/envs/tf2.9/lib/python3.8/multiprocessing/popen_fork.py", line 69, in _launch
child_r, parent_w = os.pipe()
OSError: [Errno 24] Too many open files
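As in the first question on this page, the crash comes from creating a Process (and the pipe behind it) for every file up front, not from the image files themselves; a bounded pool keeps only a fixed number of children, and therefore a fixed number of descriptors, alive at once. A minimal sketch of a drop-in replacement for convert_to_image(), reusing the task() function from the question (the worker count of 8 matches the cores mentioned there, and the .mp4 filter moves out of task() so the exit(0) call is no longer needed):

from multiprocessing import Pool

def convert_to_image():
    # collect (path, name) pairs instead of building Process objects up front
    jobs = [(path, name)
            for path, subdirs, files in os.walk(base_path)
            for name in files
            if name.endswith('.mp4')]
    # 8 worker processes, each handling one video at a time
    with Pool(processes=8) as pool:
        pool.starmap(task, jobs)
    print('Done', flush=True)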

Which Methods Open A File in Python

I'm trying to run code which writes its output to a bunch of files (upwards of 250). I'm currently using:
with open(self.file, self.mode, newline='') as file_writer:
    writer = csv.writer(file_writer)
    writer.writerow(row)
This should auto close my files but I'm still getting the following:
52 of 231 | Analysing player: Evolved ANN 5 Noise 05 ...
Traceback (most recent call last):
File "fullAnalysis.py", line 200, in <module>
run_one.start()
File "fullAnalysis.py", line 179, in start
print_output=False)
File "/home/vi/td/axelrod-dojo/src/axelrod_dojo/algorithms/genetic_algorithm.py", line 28, in __init__
self.pool = Pool(processes=self.processes)
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/context.py", line 119, in Pool
context=self.get_context())
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/pool.py", line 174, in __init__
self._repopulate_pool()
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/pool.py", line 239, in _repopulate_pool
w.start()
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/process.py", line 105, in start
self._popen = self._Popen(self)
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/context.py", line 277, in _Popen
return Popen(process_obj)
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/popen_fork.py", line 26, in __init__
self._launch(process_obj)
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/popen_fork.py", line 72, in _launch
parent_r, child_w = os.pipe()
OSError: [Errno 24] Too many open files
My question is, which methods open files in Python, so I can find this leak and plug it?
Currently the only file-related methods I'm using are a mixture of:
os.makedirs()
os.remove()
csv.writerow() (as shown above, inside the with block)
(This is the second time it's happened at #52 after implementing the with-block code.)
EDIT
Below is the main section of code at the top of the trace for fullAnalysis.py:
for opponent in self.opponent_list:
    print(i, "of", len(self.opponent_list), "| Analysing player:", str(opponent), "...")
    global_processes = 20
    # Stochastic players need seeding
    if opponent.classifier['stochastic']:
        opponent = self._get_seeded_player_class(type(opponent))(self.global_seed)
        global_processes = 1
    population = axl_dojo.Population(params_class=axl_dojo.CyclerParams,
                                     params_kwargs=cycler_kwargs,
                                     size=POPULATION_SIZE,
                                     # processes=global_processes,
                                     population=getPreMadePop(POPULATION_SIZE),
                                     objective=cycler_objective,
                                     output_filename=self._get_file_name(opponent),
                                     opponents=[opponent],
                                     print_output=False)
    population.run(GENERATION_LENGTH)
    print("{:.2f}% Done.\tSaved to:".format((100 * i) / len(self.opponent_list)),
          self._get_file_name(opponent))
    TRACKER.print_diff()
    # self.output_files[str(opponent)] = self._get_file_name(opponent)
    i += 1
Below is the __init__ code from the axelrod_dojo genetic_algorithm.py:
def __init__(self, params_class, params_kwargs, size, objective, output_filename,
             bottleneck=None, mutation_probability=.1, opponents=None,
             processes=1, weights=None,
             sample_count=None, population=None, print_output=True):
    self.params_class = params_class
    self.bottleneck = bottleneck
    self.print_output = print_output
    if processes == 0:
        self.processes = cpu_count()
    else:
        self.processes = processes
    self.pool = Pool(processes=self.processes)
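A plausible explanation, inferred from the traceback rather than anything visible in these snippets: every axl_dojo.Population constructed in the opponent loop builds a fresh multiprocessing.Pool, and each pool opens several pipes per worker. If those pools are never closed, the descriptors accumulate until os.pipe() fails around opponent #52. On Linux you can watch the count grow with a small helper (a sketch; /proc/self/fd is Linux-specific):

import os

def open_fd_count():
    """Number of file descriptors currently open in this process."""
    return len(os.listdir('/proc/self/fd'))

# inside the opponent loop:
# print("open fds:", open_fd_count())

If the count climbs by roughly the pool size on every iteration, closing each finished pool (pool.close(); pool.join()) or reusing a single Pool across Population objects should plug the leak.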

Memory issue with multiprocessing in Python

I am trying to use the other cores of my machine in my Python program. The following is the basic structure/logic of my code:
import multiprocessing as mp
import pandas as pd
import gc

def multiprocess_RUN(param):
    result = Analysis_Obj.run(param)
    return result

class Analysis_Obj():
    def __init__(self, filename):
        self.DF = pd.read_csv(filename)
        return

    def run_Analysis(self, param):
        # Multi-core option
        pool = mp.Pool(processes=1)
        run_result = pool.map(multiprocess_RUN, [self, param])
        # Normal option
        run_result = self.run(param)
        return run_result

    def run(self, param):
        # Let's say I have written a function to count the frequency of 'param' in the target file
        result = count(self.DF, param)
        return result

if __name__ == "__main__":
    files = ['file1.csv', 'file2.csv']
    params = [1, 2, 3, 4]
    results = []
    for i in range(0, len(files)):
        analysis = Analysis_Obj(files[i])
        for j in range(0, len(params)):
            result = analysis.run_Analysis(params[j])
            results.append(result)
            del result
        del analysis
        gc.collect()
If I comment out the 'Multi-core option' and run the 'Normal option', everything runs fine. But even if I run the 'Multi-core option' with processes=1, I get a memory error when my for loop starts on the 2nd file. I have deliberately set it up so that I create and delete an Analysis object in each for loop, so that the file that has been processed will be cleared from memory. Clearly this hasn't worked. Advice on how to get around this would be very much appreciated.
Cheers
EDIT:
Here is the error message I have in the terminal:
Exception in thread Thread-7:
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 754, in run
self.__target(*self.__args, **self.__kwargs)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 326, in _handle_workers
pool._maintain_pool()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 230, in _maintain_pool
self._repopulate_pool()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 223, in _repopulate_pool
w.start()
File "/usr/lib/python2.7/multiprocessing/process.py", line 130, in start
self._popen = Popen(self)
File "/usr/lib/python2.7/multiprocessing/forking.py", line 121, in __init__
self.pid = os.fork()
OSError: [Errno 12] Cannot allocate memory
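A likely contributor (an assumption based on the structure shown, since the real run() is not included): run_Analysis() creates a new mp.Pool on every call and never shuts it down, so each call leaves a live worker process, and the memory it holds, behind until os.fork() itself fails with ENOMEM. Under Python 2.7, which the traceback shows, the pool has to be closed explicitly rather than used as a context manager. A sketch of the 'Multi-core option' with the pool cleaned up (the map arguments are kept exactly as in the question):

    def run_Analysis(self, param):
        # Multi-core option
        pool = mp.Pool(processes=1)
        try:
            run_result = pool.map(multiprocess_RUN, [self, param])
        finally:
            # without these two calls every run_Analysis() call leaves a
            # live worker process (and its memory) behind
            pool.close()
            pool.join()
        return run_result

Note also that pool.map() calls multiprocess_RUN once per element of [self, param], i.e. once with self and once with param, which is probably not what was intended.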

Python multithreading and multiprocessing together

Is it possible to spawn multiple processes from a single thread? Or is it a proper design to implement?
My code sample is -
def run_all_tasks(self):
    for platform in self._platforms:
        task_thread = threading.Thread(
            target=self._run_task_list, args=(
                self._get_tasks(), platform,))
        taskset_threads.append(task_thread)
    for taskset_thread in taskset_threads:
        taskset_thread.start()
    for taskset_thread in taskset_threads:
        taskset_thread.join()

def _run_task_list(self, tasklist, platform):
    try:
        test_case_name = task.__class__.__name__
        try:
            test_case_name = task._get_test_case_name()
        except:
            test_case_name = task.__class__.__name__
            pass
        max_runtime = task.get_max_runtime()
        manager = Manager()
        self._shared_mem = manager.dict()
        for task in tasklist:
            task_proc = Process(
                target=self.proc_setup,
                args=(task, self, self._shared_mem))
            task_proc.start()
            task_proc.join(max_runtime)
This works; however, sometimes it gives the following error:
Traceback (most recent call last):
File "C:\wor\lib\TaskSet.py", line 430, in _run_task_list
if "warning" in self._shared_mem:
File "<string>", line 2, in __contains__
File "C:\Python27\lib\multiprocessing\managers.py", line 755, in _callmethod
self._connect()
File "C:\Python27\lib\multiprocessing\managers.py", line 742, in _connect
conn = self._Client(self._token.address, authkey=self._authkey)
File "C:\Python27\lib\multiprocessing\connection.py", line 167, in Client
c = PipeClient(address)
File "C:\Python27\lib\multiprocessing\connection.py", line 387, in PipeClient
win32.WaitNamedPipe(address, 1000)
WindowsError: [Error 2] The system cannot find the file specified
This can also be seen on the Linux platform.
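One thing visible in the snippet that can produce exactly this kind of intermittent pipe error: each thread calls _run_task_list(), creates its own Manager() and overwrites self._shared_mem with that manager's dict proxy. A manager's server process shuts down as soon as the Manager object is garbage collected, so a proxy left behind by another thread can end up pointing at a server that no longer exists. A hedged sketch of one way to restructure it, keeping one manager and one dict per call in local variables (proc_setup and get_max_runtime are taken from the question; this is an assumption about the cause, not a verified fix):

def _run_task_list(self, tasklist, platform):
    # one Manager per call, held in a local for the whole call;
    # avoid overwriting self._shared_mem from several threads at once
    manager = Manager()
    shared_mem = manager.dict()
    for task in tasklist:
        task_proc = Process(
            target=self.proc_setup,
            args=(task, self, shared_mem))
        task_proc.start()
        task_proc.join(task.get_max_runtime())
        if "warning" in shared_mem:
            pass  # handle warnings reported by the child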

python - Writing to csv file out of coroutine sink ... how to avoid closed file error?

My code (simplified):
import csv

def generate_record(downstream):
    try:
        while True:
            incoming = (yield)
            record = incoming.strip()
            for worker in downstream:
                worker.send(record)
    except GeneratorExit:
        for worker in downstream:
            worker.close()
        print('generate_record shutdown')

def file_writer(filename):
    l = list()
    try:
        while True:
            record = (yield)
            l.append(record)
    except GeneratorExit:
        with open(filename, 'w', newline=''):
            writer = csv.writer(f)
            writer.writerows(l)
        print('file_writer shutdown')

if __name__ == '__main__':
    sink = file_writer('C:/Users/Some User/Downloads/data.csv')
    next(sink)
    worker = generate_record([sink])
    next(worker)
    with open('C:/Users/Some User/Downloads/Energy.txt') as f:
        for line in f:
            worker.send(line)
    worker.close()
Generates the following error:
Traceback (most recent call last):
File "<ipython-input-43-ff97472f6399>", line 1, in <module>
runfile('C:/Users/Some User/Documents/Python Scripts/isii.py', wdir='C:/Users/Some User/Documents/Python Scripts')
File "C:\Users\Some User\Anaconda3\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 699, in runfile
execfile(filename, namespace)
File "C:\Users\Some User\Anaconda3\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 88, in execfile
exec(compile(open(filename, 'rb').read(), filename, 'exec'), namespace)
File "C:/Users/Some User/Documents/Python Scripts/isii.py", line 75, in <module>
worker.close()
File "C:/Users/Some User/Documents/Python Scripts/isii.py", line 49, in generate_record
worker.close()
File "C:/Users/Some User/Documents/Python Scripts/isii.py", line 63, in file_writer
writer.writerows(l)
ValueError: I/O operation on closed file.
What have I tried?
I've tried writing incrementally with writerow inside file_writer's try block, but that generates the same error.
The with statement in file_writer is missing the as f part; because of that, f refers to the global variable f instead, which is already closed at the time of writing, and that is what raises the "I/O operation on closed file" error.
with open(filename, 'w', newline='') as f:
^^^^
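For reference, here is file_writer with that single change applied (a sketch; everything else is exactly as in the question):

def file_writer(filename):
    l = list()
    try:
        while True:
            record = (yield)
            l.append(record)
    except GeneratorExit:
        # bind the freshly opened file to f so csv.writer uses it,
        # not the already-closed global f from the __main__ block
        with open(filename, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(l)
        print('file_writer shutdown')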
When you use with open(filename) as f:, it performs the operations you have added inside the block and then closes the file. So you don't need to use worker.close(), since you would be trying to close a file that is already closed.
See: What is the python keyword "with" used for?
This should be a comment, but I do not seem to have enough reputation for that.
