Proper way to share a list between processes?

I want to set up two subprocesses: subprocess1 keeps generating data (as lists), and subprocess2 is in charge of processing the data sent from subprocess1.
I used multiprocessing.Manager().list() to create a shared list, but this is the error it reports:
FileNotFoundError: [WinError 2]
Code
I simplified the code as below:
PS: it needs to be run from a terminal.
import multiprocessing as mp
import random
import time

def generator(a, b, tick):
    # Simulates data collection: a list is generated at random and passed to another shared list.
    counter = 0
    while True:
        time.sleep(1)
        a.append([random.uniform(1, 5), random.uniform(1, 5), random.uniform(1, 5), random.uniform(1, 5)])
        counter += 1
        print('generate says', a[:])
        if counter % 5 == 0:
            b.append(a[:])
            tick.value = 1  # Telling 'printer' func to print.
            for _ in a:
                a.remove(_)

def printer(b, tick):
    # Simulates data processing; only prints data received from the 'generator' func here.
    while True:
        time.sleep(1)
        if tick.value == 1:
            time.sleep(1)
            print('printer says', b[:])
            tick.value = 0
            for _ in b:
                b.remove(_)

if __name__ == '__main__':
    tick = mp.Value('i', 0)
    a = mp.Manager().list()
    b = mp.Manager().list()
    p1 = mp.Process(target=generator, args=(a, b, tick))
    p2 = mp.Process(target=printer, args=(b, tick))
    p1.start()
    p2.start()
Error
Traceback (most recent call last):
File "d:\miniconda\lib\multiprocessing\process.py", line 297, in _bootstrap
self.run()
File "d:\miniconda\lib\multiprocessing\process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "D:\Program Files (x86)\onedrive\nilm\pi\redd\niubi.py", line 9, in generater
a.append([random.uniform(1,5),random.uniform(1,5),random.uniform(1,5),random.uniform(1,5)])
File "<string>", line 2, in append
File "d:\miniconda\lib\multiprocessing\managers.py", line 792, in _callmethod
self._connect()
File "d:\miniconda\lib\multiprocessing\managers.py", line 779, in _connect
conn = self._Client(self._token.address, authkey=self._authkey)
File "d:\miniconda\lib\multiprocessing\connection.py", line 490, in Client
c = PipeClient(address)
File "d:\miniconda\lib\multiprocessing\connection.py", line 691, in PipeClient
_winapi.WaitNamedPipe(address, 1000)
FileNotFoundError: [WinError 2] The system cannot find the file specified.

There were a few things to fix, but the primary issue is that the main process exits right after starting the children, which shuts down the Manager and the server process backing the shared lists, so the proxies in the children can no longer connect (that is what produces the WinError 2). Adding Process.join() keeps the main process, and with it the manager, alive, as seen below:
import multiprocessing as mp
import random
import time

... # generator and printer definitions are unchanged

if __name__ == '__main__':
    manager = mp.Manager()  # Create a single instance of the manager
    a = manager.list()
    b = manager.list()
    tick = mp.Value('i', 0)
    p1 = mp.Process(target=generator, args=(a, b, tick))
    p2 = mp.Process(target=printer, args=(b, tick))
    p1.start()
    p2.start()
    p1.join()  # Join, to ensure the main process waits for p1 and p2
    p2.join()
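If you prefer the manager's lifetime to be explicit, you can also use it as a context manager. The following is only a minimal sketch of the same main block, assuming the generator and printer definitions above are unchanged:
import multiprocessing as mp

if __name__ == '__main__':
    # Sketch only: 'generator' and 'printer' are the functions defined above.
    with mp.Manager() as manager:
        a = manager.list()
        b = manager.list()
        tick = mp.Value('i', 0)
        p1 = mp.Process(target=generator, args=(a, b, tick))
        p2 = mp.Process(target=printer, args=(b, tick))
        p1.start()
        p2.start()
        p1.join()  # The manager and its proxies stay valid only inside this block
        p2.join()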

Related

"AttributeError: 'ForkAwareLocal' object has no attribute 'connection'" even with Process.join()

I'm writing a script for comparing many DNA genomes with each other, and I'm trying to use multiprocessing to have it run faster. All the processes are appending to a common list, genome_score_avgs.
This is my main process:
if __name__ == "__main__":
    start = time.perf_counter()

    with Manager() as manager:
        genome_score_avgs = manager.list()
        processes = [Process(target=compareGenomes, args=(chunk, genome_score_avgs,)) for chunk in divideGenomes('TEST_DIR')]

        for p in processes:
            p.start()

        for p in processes:
            p.join()

    print(genome_score_avgs)
    print(*createTimeline(genome_score_avgs), sep='\n')
    print(f'Finished in {time.perf_counter() - start} seconds')
This is the error that I'm getting:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/managers.py", line 801, in _callmethod
conn = self._tls.connection
AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/ayushpal/Coding/PythonStuff/C4DInter/main.py", line 59, in <module>
print(*createTimeline(genome_score_avgs), sep='\n')
File "/Users/ayushpal/Coding/PythonStuff/C4DInter/main.py", line 42, in createTimeline
min_score = min(score_avgs, key=lambda x: x[2])
File "<string>", line 2, in __getitem__
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/managers.py", line 805, in _callmethod
self._connect()
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/managers.py", line 792, in _connect
conn = self._Client(self._token.address, authkey=self._authkey)
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/connection.py", line 507, in Client
c = SocketClient(address)
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/connection.py", line 635, in SocketClient
s.connect(address)
FileNotFoundError: [Errno 2] No such file or directory
<ListProxy object, typeid 'list' at 0x7fc04ea36bb0; '__str__()' failed>
I read in a similar Stack Overflow question that the main process is ending earlier than the other processes, which destroys the shared list, and that I should use p.join() for all the processes. That is what I'm doing, however it's still giving the same error. What should I do?
EDIT 1:
This is the code for compareGenomes():
def compareGenomes(genome_pairings, genome_score_avgs):
    scores = []
    for genome1, genome2 in genome_pairings:
        print(genome1, genome2)
        for i, seq in enumerate(genome1.protein_seqs):
            for j, seq2 in enumerate(genome2.protein_seqs[i::]):
                alignment = align.globalxx(seq, seq2)
                scores.append(alignment)

        top_scores = []
        for i in range(len(genome1.protein_seqs)):
            top_scores.append(max(scores, key=lambda x: x[0][2] / len(x[0][1])))
            scores.remove(max(scores, key=lambda x: x[0][2] / len(x[0][1])))

        avg_score = sum([i[0][2] / len(i[0][1]) for i in top_scores]) / len(top_scores)

        with open(f'alignments/{genome1.name}x{genome2.name}.txt', 'a') as file:
            file.writelines([format_alignment(*i[0]) for i in top_scores])

        genome_score_avgs.append((genome1, genome2, avg_score))
The error is happening because you are using the managed list after you have closed the manager. Once the with block exits, the server process that the manager spawned is shut down as well, so the list's proxy can no longer connect and the managed list no longer works. You need to do all the work with the list inside the with block, like below:
if __name__ == "__main__":
    start = time.perf_counter()

    with Manager() as manager:
        genome_score_avgs = manager.list()
        processes = [Process(target=compareGenomes, args=(chunk, genome_score_avgs,)) for chunk in divideGenomes('TEST_DIR')]

        for p in processes:
            p.start()

        for p in processes:
            p.join()

        print(genome_score_avgs)
        print(*createTimeline(genome_score_avgs), sep='\n')
        print(f'Finished in {time.perf_counter() - start} seconds')
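If the results are needed after the manager has shut down, one option (a sketch, not part of the original answer) is to copy the proxy's contents into a plain list before leaving the with block; the plain list is ordinary Python data and survives the manager:
if __name__ == "__main__":
    start = time.perf_counter()

    with Manager() as manager:
        genome_score_avgs = manager.list()
        processes = [Process(target=compareGenomes, args=(chunk, genome_score_avgs,)) for chunk in divideGenomes('TEST_DIR')]

        for p in processes:
            p.start()

        for p in processes:
            p.join()

        results = list(genome_score_avgs)  # plain list; usable after the manager exits

    print(results)
    print(*createTimeline(results), sep='\n')
    print(f'Finished in {time.perf_counter() - start} seconds')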

How to recover the return value of a function passed to multiprocessing.Process?

I have looked at this question to get started, and it works just fine: How can I recover the return value of a function passed to multiprocessing.Process?
In my case I would like to write a small tool that connects to many computers and gathers some statistics, with each statistic gathered within a Process to make it snappy. But as soon as I try to wrap the multiprocessing calls in a class for a machine, it fails.
Here is my code:
import multiprocessing
import pprint
import subprocess

def run_task(command):
    p = subprocess.Popen(command, stdout=subprocess.PIPE, universal_newlines=True, shell=False)
    result = p.communicate()[0]
    return result

MACHINE_NAME = "cptr_name"
A_STAT = "some_stats_A"
B_STAT = "some_stats_B"

class MachineStatsGatherer():
    def __init__(self, machineName):
        self.machineName = machineName
        manager = multiprocessing.Manager()
        self.localStats = manager.dict()  # creating a shared resource for the subprocesses to use
        self.localStats[MACHINE_NAME] = machineName

    def gatherStats(self):
        self.runInParallel(
            self.GatherSomeStatsA,
            self.GatherSomeStatsB,
        )
        self.printStats()

    def printStats(self):
        pprint.pprint(self.localStats)

    def runInParallel(self, *fns):
        processes = []
        for fn in fns:
            process = multiprocessing.Process(target=fn, args=(self.localStats))
            processes.append(process)
            process.start()
        for process in processes:
            process.join()

    def GatherSomeStatsA(self, returnStats):
        # do some remote command, simplified here for the sake of debugging
        result = "Windows"
        returnStats[A_STAT] = result.find("Windows") != -1

    def GatherSomeStatsB(self, returnStats):
        # do some remote command, simplified here for the sake of debugging
        result = "Windows"
        returnStats[B_STAT] = result.find("Windows") != -1

def main():
    machine = MachineStatsGatherer("SOMEMACHINENAME")
    machine.gatherStats()
    return

if __name__ == '__main__':
    main()
And here is the error message
Traceback (most recent call last):
File "C:\Users\mesirard\AppData\Local\Programs\Python\Python37\lib\multiprocessing\process.py", line 297, in _bootstrap
self.run()
File "C:\Users\mesirard\AppData\Local\Programs\Python\Python37\lib\multiprocessing\process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "d:\workdir\trunks6\Tools\VTKAppTester\Utils\NXMachineMonitorShared.py", line 45, in GatherSomeStatsA
returnStats[A_STAT] = result.find("Windows") != -1
TypeError: 'str' object does not support item assignment
Process Process-3:
Traceback (most recent call last):
File "C:\Users\mesirard\AppData\Local\Programs\Python\Python37\lib\multiprocessing\process.py", line 297, in _bootstrap
self.run()
File "C:\Users\mesirard\AppData\Local\Programs\Python\Python37\lib\multiprocessing\process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "d:\workdir\trunks6\Tools\VTKAppTester\Utils\NXMachineMonitorShared.py", line 50, in GatherSomeStatsB
returnStats[B_STAT] = result.find("Windows") != -1
TypeError: 'str' object does not support item assignment
The issue is coming from this line:
process = multiprocessing.Process(target=fn, args=(self.localStats))
It needs an extra comma at the end of args, so that args is a one-element tuple:
process = multiprocessing.Process(target=fn, args=(self.localStats,))
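To see why the missing comma produces that particular error: (self.localStats) is not a tuple, just the dict proxy itself, and Process effectively calls target(*args). Unpacking a dict (or a managed dict proxy) iterates its keys, so the worker receives the string "cptr_name" as returnStats, hence "'str' object does not support item assignment". A rough illustration with an ordinary dict (show_unpacking is a hypothetical name, not from the original code):
def show_unpacking(stats):
    print(stats)

d = {"cptr_name": "SOMEMACHINENAME"}

# args=(d) is just d, so unpacking iterates the dict's keys:
show_unpacking(*(d))    # prints 'cptr_name' -> the worker gets a str
# args=(d,) is a one-element tuple, so the dict itself is passed:
show_unpacking(*(d,))   # prints {'cptr_name': 'SOMEMACHINENAME'}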

too many files open error with multiprocessing

I have code that uses multiprocessing over about 10,000 files on a 12-core vCPU machine running Ubuntu.
import multiprocessing
import os

import nltk

def process_file(name):
    inp = open(name)
    out = open(name.split('.')[0] + 'wikiout.txt', 'a')
    for row in inp:
        text = row.strip()
        sent_text = nltk.sent_tokenize(text)
        for sent in sent_text:
            # process sentence
            pass
    inp.close()
    out.close()

if __name__ == '__main__':
    processes = []
    for i in 'ABCDEF':
        for j in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            for k in range(100):
                filename = os.path.join(os.path.dirname(__file__), (i + j + '/' + 'wiki_' + str(k) + '.txt'))
                p = multiprocessing.Process(target=process_file, args=(filename,))
                processes.append(p)
                p.start()
    for process in processes:
        process.join()
For some reason I get this issue
File "wikirules.py", line 37, in <module>
p.start()
File "/usr/lib/python3.8/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/usr/lib/python3.8/multiprocessing/context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "/usr/lib/python3.8/multiprocessing/context.py", line 277, in _Popen
return Popen(process_obj)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 69, in _launch
child_r, parent_w = os.pipe()
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
File "wikirules.py", line 13, in process_file
File "/usr/local/lib/python3.8/dist-packages/nltk/tokenize/__init__.py", line 106, in sent_tokenize
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 752, in load
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 877, in _open
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 327, in open
OSError: [Errno 24] Too many open files: '/root/nltk_data/tokenizers/punkt/PY3/english.pickle'
Any clue why this might be happening? I'm still new to multiprocessing, so shouldn't this open no more than 12 files at once?
Your code is trying to run
len('ABCDEF') * len('ABCD...Z') * len(range(100)) = 6 * 26 * 100 = 15,600
operating system processes simultaneously.
The multiprocessing module contains relatively low-level primitives for working with processes. For basic tasks the standard library offers a safer and more convenient option, the concurrent.futures module, which provides pool implementations for threads and processes and is especially useful for "embarrassingly parallel" workloads.
Here is an example of how the code from your question could be transformed using concurrent.futures and a few other Python features such as generators, context managers, and the pathlib module.
import concurrent.futures as futures
import itertools
import pathlib

import nltk

BASE_PATH = pathlib.Path(__file__).parent.absolute()

def filename_generator():
    """produce filenames sequence"""
    for i, j, k in itertools.product("ABCDEF", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", range(100)):
        yield BASE_PATH / f"{i}{j}/wiki_{k}.txt"

def worker(filename: pathlib.Path):
    """do all the job"""
    out_filename = filename.with_suffix('.wikiout.txt')
    with open(filename) as inp, open(out_filename, "a") as out:
        for row in inp:
            text = row.strip()
            sent_text = nltk.sent_tokenize(text)
            for sent in sent_text:
                """process sentence"""

def main():
    with futures.ProcessPoolExecutor() as pool:
        # mapping future->filename, useful in case of error
        task_to_filename = {pool.submit(worker, f): f for f in filename_generator()}
        for f in futures.as_completed(task_to_filename):
            try:
                f.result()
            except Exception as e:
                filename = task_to_filename[f]
                print(f"{filename} processing failed: {e}")

if __name__ == "__main__":
    main()
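One practical detail worth adding: ProcessPoolExecutor defaults to one worker process per CPU (os.cpu_count()), so on the 12-core machine described, at most 12 files are processed at a time while the remaining ~15,600 tasks simply wait in the executor's queue. If you want the cap to be explicit, the pool accepts a max_workers argument; a minimal sketch:
import concurrent.futures as futures

# Explicitly limit the pool to 12 worker processes (the default is os.cpu_count()).
with futures.ProcessPoolExecutor(max_workers=12) as pool:
    ...  # submit tasks as in main() above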

Python multiprocessing Deadlock using Queue

I have a Python program like the one below.
from multiprocessing import Lock, Process, Queue, current_process
import time

lock = Lock()

def do_job(tasks_to_accomplish, tasks_that_are_done):
    while not tasks_to_accomplish.empty():
        task = tasks_to_accomplish.get()
        print(task)
        lock.acquire()
        tasks_that_are_done.put(task + ' is done by ' + current_process().name)
        lock.release()
        time.sleep(1)
    return True

def main():
    number_of_task = 10
    number_of_processes = 4
    tasks_to_accomplish = Queue()
    tasks_that_are_done = Queue()
    processes = []

    for i in range(number_of_task):
        tasks_to_accomplish.put("Task no " + str(i))

    # creating processes
    for w in range(number_of_processes):
        p = Process(target=do_job, args=(tasks_to_accomplish, tasks_that_are_done))
        processes.append(p)
        p.start()

    # completing process
    for p in processes:
        p.join()

    # print the output
    while not tasks_that_are_done.empty():
        print(tasks_that_are_done.get())

    return True

if __name__ == '__main__':
    main()
Sometimes the program runs perfectly, but sometimes it gets stuck and doesn't complete. When I quit it manually, it produces the following error.
$ python3 multiprocessing_example.py
Task no 0
Task no 1
Task no 2
Task no 3
Task no 4
Task no 5
Task no 6
Task no 7
Task no 8
Task no 9
^CProcess Process-1:
Traceback (most recent call last):
File "multiprocessing_example.py", line 47, in <module>
main()
File "multiprocessing_example.py", line 37, in main
p.join()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 121, in join
res = self._popen.wait(timeout)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/popen_fork.py", line 51, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/popen_fork.py", line 29, in poll
pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "multiprocessing_example.py", line 9, in do_job
task = tasks_to_accomplish.get()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/queues.py", line 94, in get
res = self._recv_bytes()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
KeyboardInterrupt
Can someone tell me what is the issue with the program? I am using python 3.6.
Note: a Lock is not needed around a Queue.
lock.acquire()
tasks_that_are_done.put(task + ' is done by ' + current_process().name)
lock.release()
The queue documentation says:
The Queue class in this module implements all the required locking semantics.
Question: ... what is the issue with the program?
You are using Queue.empty() and then Queue.get(). This can lead to a deadlock on join(), because there is no guarantee that the queue is still non-empty by the time get() is reached: another process can take the last item between the empty() check and the get(), and get() then blocks forever.
Deadlock prone:
while not tasks_to_accomplish.empty():
    task = tasks_to_accomplish.get()
Instead of using the empty()/get() pair, use, for instance:
import queue

while True:
    try:
        task = tasks_to_accomplish.get_nowait()
    except queue.Empty:
        break
    else:
        # Handle task here
        ...
        # task_done() exists only on multiprocessing.JoinableQueue;
        # with the plain multiprocessing.Queue used above, omit this call.
        tasks_to_accomplish.task_done()
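Putting the pattern into the worker from the question, do_job might look like this (a sketch only, keeping the plain multiprocessing.Queue from the original code and therefore no task_done()):
import queue
import time
from multiprocessing import current_process

def do_job(tasks_to_accomplish, tasks_that_are_done):
    while True:
        try:
            task = tasks_to_accomplish.get_nowait()
        except queue.Empty:
            break  # no work left, exit cleanly instead of blocking in get()
        print(task)
        tasks_that_are_done.put(task + ' is done by ' + current_process().name)
        time.sleep(1)
    return True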

Python Threading and Multiprocessing in the same code causing pickling error

While using Python 3 on Windows 7 to process some large CSV files, I have run into an issue with the program not running fast enough. The original working version of the code is similar to the code below, except the two process calls were both threads. Upon adding the multiprocessing library and converting the tdg.Thread calls to mp.Process as shown below, I receive this pickling error:
line 70, in <module>
proc1.start()
File "C:\Python34\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "C:\Python34\lib\multiprocessing\context.py", line 212, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "C:\Python34\lib\multiprocessing\context.py", line 313, in _Popen
return Popen(process_obj)
File "C:\Python34\lib\multiprocessing\popen_spawn_win32.py", line 66, in __init__
reduction.dump(process_obj, to_child)
File "C:\Python34\lib\multiprocessing\reduction.py", line 59, in dump
ForkingPickler(file, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <class '_thread.lock'>: attribute lookup lock on _thread failed
Code:
import multiprocessing as mp
import threading as tdg
import queue as q

def my_p1func1(data, Q):
    # performs LDAP for data set 1
    print("p1f1:", data)
    Q.put(data)

def my_p1func2(data, Q):
    # performs LDAP for data set 2
    print("p1f2:", data)
    Q.put(data)

def my_proc1(data, Q):
    f1_Q = q.Queue()
    f2_Q = q.Queue()
    f1 = tdg.Thread(target=my_p1func1, args=(data['1'], f1_Q))
    f2 = tdg.Thread(target=my_p1func2, args=(data['2'], f2_Q))
    f1.start()
    f2.start()
    f1.join()
    f2.join()
    f1_out = f1_Q.get()
    f2_out = f2_Q.get()
    Q.put({'f1': f1_out, 'f2': f2_out})

def my_p2func1(data, Q):
    # perform gethostbyaddr() for data set 1
    print("p2f1:", data)
    Q.put(data)

def my_p2func2(data, Q):
    # perform gethostbyaddr() for data set 2
    print("p2f2:", data)
    Q.put(data)

def my_proc2(data, Q):
    f1_Q = q.Queue()
    f2_Q = q.Queue()
    f1 = tdg.Thread(target=my_p2func1, args=(data['1'], f1_Q))
    f2 = tdg.Thread(target=my_p2func2, args=(data['2'], f2_Q))
    f1.start()
    f2.start()
    f1.join()
    f2.join()
    f1_out = f1_Q.get()
    f2_out = f2_Q.get()
    Q.put({'f1': f1_out, 'f2': f2_out})

dataIn = {'1': [1, 2, 3], '2': ['a', 'b', 'c']}

pq1 = q.Queue()
pq2 = q.Queue()

proc1 = mp.Process(target=my_proc1, args=(dataIn, pq1))
proc2 = mp.Process(target=my_proc2, args=(dataIn, pq2))

proc1.start()
proc2.start()

proc1.join()
proc2.join()

p1 = pq1.get()
p2 = pq2.get()

print(p1)
print(p2)
I thought the issue was being caused by the Locks I had around my print statements, but even after removing them it continues to throw the same pickling error.
I am in over my head with this and would appreciate any help understanding why it is attempting to pickle something not in use, and how to get this running so that it is more efficient.
You can't use a regular queue.Queue object with multiprocessing; you have to use a multiprocessing.Queue. The standard queue.Queue won't be shared between the processes, even if you were to make it picklable. It's an easy fix, though:
if __name__ == "__main__":
    dataIn = {'1': [1, 2, 3], '2': ['a', 'b', 'c']}

    pq1 = mp.Queue()
    pq2 = mp.Queue()

    proc1 = mp.Process(target=my_proc1, args=(dataIn, pq1))
    proc2 = mp.Process(target=my_proc2, args=(dataIn, pq2))

    proc1.start()
    proc2.start()

    proc1.join()
    proc2.join()

    p1 = pq1.get()
    p2 = pq2.get()
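Note that only the queues crossing the process boundary (pq1 and pq2) need to be multiprocessing.Queue; the f1_Q and f2_Q queues inside my_proc1 and my_proc2 are shared only between threads of the same process, so they can stay queue.Queue. A minimal sketch of that rule of thumb:
import multiprocessing as mp
import queue as q

# Choose the queue type by the boundary it crosses:
thread_q = q.Queue()     # shared only between threads within one process
process_q = mp.Queue()   # passed to mp.Process, crosses process boundaries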
