I'm working with Orcaflex (a FEM software for offshore analysis, but should not be relevant). I created a script to check if the simulations I've performed have been completed successfully (The simulation can fail for not reaching convergence). Since I'm talking about thousands of files I was trying to parallelize the process with multiprocessing. Following, my code. Sorry but I can't produce a working example for you, but I'll try to explain in detail. I created a derived Class of multiprocessing.Process and overwrite the run() to perform the checks on the simulations files.
Then, in __main__ I set a number of processors, split the files accordingly, and start the execution.
The problem is that the processes are not spawning altogether but, in what appear to be, a random amount of time from one to another. Is this what it is supposed to be or am I missing something?
What I mean by not spawning altogether is that I see:
[Info/Worker-1] child process calling self.run()
and for example:
[Info/Worker-4] child process calling self.run()
after about 10 min of the program running.
Thanks in advance for any help/suggetsion.
import os
import subprocess
import glob
import multiprocessing
import logging
import sys
import OrcFxAPI as of
class Worker(multiprocessing.Process):
myJobs = []
def setJobs(self, jobList):
self.myJobs = jobList
#staticmethod
def changedExtensionFileName(oldFileName, newExtension):
return '.'.join((os.path.splitext(oldFileName)[0], newExtension))
def run(self):
failed = []
model = of.Model(threadCount=1)
for job in self.myJobs:
try:
print('%s starting' % job)
sys.stdout.flush()
model.LoadSimulation(job)
if model.state == of.ModelState.SimulationStoppedUnstable:
newJob = job.replace('.sim', '.dat')
failed.append(newJob)
with open('Failed_Sim.txt', 'a') as f:
f.write(f'{newJob}\n')
f.close()
model.LoadData(newJob)
model.general.ImplicitConstantTimeStep /= 2
model.SaveData(newJob)
print(f'{job} has failed, reducing time step')
except of.DLLError as err:
print('%s ERROR: %s' % (job, err))
sys.stdout.flush()
with open(self.changedExtensionFileName(job, 'FAIL'), 'w') as f:
f.write('%s error: %s' % (job, err))
f.close()
return
if __name__ == '__main__':
import re
sim_file = [f for f in os.listdir() if re.search(r'\d\d\d\d.*.sim', f)]
# begin multprocessing
multiprocessing.log_to_stderr()
logger = multiprocessing.get_logger()
logger.setLevel(logging.INFO)
corecount = 14
workers = []
chunkSize = int(len(sim_file) / corecount)
chunkRemainder = int(len(sim_file) % corecount)
print('%s jobs found, dividing across %s workers - %s each remainder %s' % (str(len(sim_file)), str(corecount), chunkSize, chunkRemainder))
start = 0
for coreNum in range(0, corecount):
worker = Worker()
workers.append(worker)
end = start + chunkSize
if chunkRemainder>0:
chunkRemainder -= 1
end += 1
if end>len(sim_file):
end = len(sim_file)
worker.setJobs(sim_file[start:end])
worker.start()
start = end
if start>=len(sim_file):
break
for worker in workers:
worker.join()
print('Done...')
OK, so no one put up their hand to answer this by minor tweak (which I don't know how to do!), so here comes the larger rejig proposal...
def worker(inpData):
#The worker process
failed1 = []
failed2 = []
for job in inpData: #I'm not sure of the data shape of the chunks, has your original method split them into coherent chunks capable of being processed independently? My step here could be wrong.
try:
#print('%s starting' % job) #Prints won't appear on console from worker processes from windows, so commented them all out
model.LoadSimulation(job)
if model.state == of.ModelState.SimulationStoppedUnstable:
newJob = job.replace('.sim', '.dat')
failed1.append(newJob)
#I'd recommend we pass the list "failed" back to master and write to text from there, otherwise you could have several processes updating the text file at once, leading to possible loss of data
#with open('Failed_Sim.txt', 'a') as f:
# f.write(f'{newJob}\n')
# f.close()
model.LoadData(newJob)
model.general.ImplicitConstantTimeStep /= 2
model.SaveData(newJob)
#print(f'{job} has failed, reducing time step')
except of.DLLError as err:
#print('%s ERROR: %s' % (job, err))
#sys.stdout.flush()
#with open(self.changedExtensionFileName(job, 'FAIL'), 'w') as f:
# f.write('%s error: %s' % (job, err))
# f.close()
failed2.append(job)
#Note I've made two failed lists to pass back, for both failure types
return failed1, failed2
if __name__ == "__main__":
import re
import multiprocessing as mp
nCPUs = mp.cpu_count()
sim_file = [f for f in os.listdir() if re.search(r'\d\d\d\d.*.sim', f)]
#Make the chunks
chunkSize = int(len(sim_file) / corecount)
chunkRemainder = int(len(sim_file) % corecount)
print('%s jobs found, dividing across %s workers - %s each remainder %s' % (str(len(sim_file)), str(corecount), chunkSize, chunkRemainder))
chunks = []
start = 0
for iChunk in range(0, nCPUs)
end = start + chunkSize
if chunkRemainder>0:
chunkRemainder -= 1
end += 1
if end>len(sim_file):
end = len(sim_file)
chunk.append(sim_file[start:end])
#Send to workers
pool = mp.Pool(processes=nCPUs)
futA = []
for iChunk in range(0, nCPUs):
futA.append(pool.apply_async(worker, args=(chunk[iChunk],))
#Gather results
if futA:
failedDat = []
failedSim = []
for iChunk in range(0, len(futA)):
resA, resB = futA[iChunk].get()
failedDat.extend(resA)
failedSim.extend(resB)
pool.close()
if failedDat:
print("Following jobs failed, reducing timesteps:")
print(failedDat)
if failedSim:
print("Following sims failed due to errors")
print(failedSim)
Related
from multiprocessing import Pool
from functools import partial
from time import sleep
import random
import string
import uuid
import os
import glob
def task_a(param1, param2, mydata):
thread_id = str(uuid.uuid4().hex) # this may not be robust enough to guarantee no collisions, address
output_filename = ''.join([str(thread_id),'.txt'])
# part 1 - create output file for task_b to use
with open(output_filename, 'w') as outfile:
for line in mydata:
outfile.write(line)
# part 2 - do some extra stuff (whilst task_b is running)
sleep(5)
print('Task A finished')
return output_filename # not interested in return val
def task_b(expected_num_files):
processed_files = 0
while processed_files<expected_num_files:
print('I am task_b, waiting for {} files ({} so far)'.format(expected_num_files, processed_files))
path_to_search = ''
for filename in glob.iglob(path_to_search + '*.txt', recursive=True):
print('Got file : {}'.format(filename))
# would do something complicated here
os.rename(filename, filename+'.done')
processed_files+=1
sleep(10)
if __name__ == '__main__':
param1 = '' # dummy variable, need to support in solution
param2 = '' # dummy variable, need to support in solution
num_workers = 2
full_data = [[random.choice(string.ascii_lowercase) for _ in range(5)] for _ in range(100)]
print(full_data)
for i in range(0, len(full_data), num_workers):
print('Going to process {}'.format(full_data[i:i+num_workers]))
p = Pool(num_workers)
task_a_func = partial(task_a, param1, param2)
results = p.map(task_a_func, full_data[i:i+num_workers])
p.close()
p.join()
task_b(expected_num_files=num_workers) # want this running sooner
print('Iteration {} complete'.format(i))
#want to wait for task_a's and task_b to finish
I'm having trouble scheduling these tasks to run concurrently.
task_a is a multiprocessing pool that produces an output file part way through it execution.
task_b MUST process the output files sequentially can be in any order (can be as soon as they are available), WHILST task_a continues to run (it will no longer change the output file)
The next iteration must only start when both all task_a's have completed AND task_b has completed.
The toy code I have posted obviously waits for task_a's to fully complete before task_b is started (which is not what I want)
I have looked at multiprocessing / subprocess etc. but cannot find a way to launch both the pool and the single task_b process concurrently AND wait for BOTH to finish.
task_b is written as if it could be changed to an external script, but I am still stuck on how manage the execution.
Should I effectively merge code from task_b into task_a and somehow pass a flag to ensure one worker per pool 'runs the task_b code' via a if/else - at least then I would just be waiting on the pool to complete?
You can use an interprocess queue to communicate the filenames between task a and task b.
Also, initializing pool repeatedly inside the loop is harmful and unnecessarily slow.
Its better to initialize the pool once in the beginning.
from multiprocessing import Pool, Manager, Event
from functools import partial
from time import sleep
import random
import string
import uuid
import os
import glob
def task_a(param1, param2, queue, mydata):
thread_id = str(uuid.uuid4().hex)
output_filename = ''.join([str(thread_id),'.txt'])
output_filename = 'data/' + output_filename
with open(output_filename, 'w') as outfile:
for line in mydata:
outfile.write(line)
print(f'{thread_id}: Task A file write complete for data {mydata}')
queue.put(output_filename)
print('Task A finished')
def task_b(queue, num_workers, data_size, event_task_b_done):
print('Task b started!')
processed_files = 0
while True:
filename = queue.get()
if filename == 'QUIT':
# Whenever you want task_b to quit, just push 'quit' to the queue
print('Task b quitting')
break
print('Got file : {}'.format(filename))
os.rename(filename, filename+'.done')
processed_files+=1
print(f'Have processed {processed_files} so far!')
if (processed_files % num_workers == 0) or (processed_files == data_size):
event_task_b_done.set()
if __name__ == '__main__':
param1 = '' # dummy variable, need to support in solution
param2 = '' # dummy variable, need to support in solution
num_workers = 2
data_size = 100
full_data = [[random.choice(string.ascii_lowercase) for _ in range(5)] for _ in range(data_size)]
mgr = Manager()
queue = mgr.Queue()
event_task_b_done = mgr.Event()
# One extra worker for task b
p = Pool(num_workers + 1)
p.apply_async(task_b, args=(queue, num_workers, data_size, event_task_b_done))
task_a_func = partial(task_a, param1, param2, queue)
for i in range(0, len(full_data), num_workers):
data = full_data[i:i+num_workers]
print('Going to process {}'.format(data))
p.map_async(task_a_func, full_data[i:i+num_workers])
print(f'Waiting for task b to process all {num_workers} files...')
event_task_b_done.wait()
event_task_b_done.clear()
print('Iteration {} complete'.format(i))
queue.put('QUIT')
p.close()
p.join()
exit(0)
How in the main thread can I track the duration of the function write_file()?
Task: create a condition, if the execution time of the function is more than 10 seconds, then it is necessary to restart the function.
from multiprocessing import Pool
def write_file(file: str):
f = open(file, 'w')
for item in range(0, 1500000):
f.write("%s\n" % item)
f.close()
if __name__ == '__main__':
list_files = ['1.txt', '2.txt', '3.txt']
with Pool(3) as p:
p.map(write_file, list_files)
I found attempt to amend Pool to be overcomplicated here.
The Pool class let workers to be alive untill the whole working queue is done and thus has complex mechanism to control it.
Instead, if you have not very stict requirenments on 10 second, you can use the following code:
from multiprocessing import Process
import time
pdict = {}
for fname in list_files:
p = Process(target = write_file, args = (fname,))
pdict[fname] = p
p.start()
while pdict:
to_del = []
time.sleep(10)
for pname in pdict:
if pdict[pname].exitcode == None or pdict[pname].is_alive():
pdict[pname].terminate() #killing old; that should also release file resource
pdict[pname] = Process(target = write_file, args = (pname,))
pdict[pname].start() #simply creating new and starting
else:
to_del.append(pname)
for pname in to_del:
del pdict[pname]
I'm trying to speed up some data processing using the multiprocessing module, the idea being I can send a chunk of data to each process I start up to utilize all the cores on my machine instead of just one at a time.
So I built an iterator for the data using the pandas read_fwf() function, with chunksize=50000 lines at a time. My problem is that eventually the iterator should raise StopIteration, and I'm trying to catch this in an except block in the child process and pass it along to the parent thread using a Queue to let the parent know it can stop spawning child processes. I have no idea what's wrong though, but what's happening is it gets to the end of the data and then keeps spawning processes which essentially do nothing.
def MyFunction(data_iterator, results_queue, Placeholder, message_queue):
try:
current_data = data_iterator.next()
#does other stuff here
#that isn't important
placeholder_result = "Eggs and Spam"
results_queue.put(placeholder_result)
return None
except StopIteration:
message_queue.put("Out Of Data")
return None
results_queue = Queue() #for passing results from each child process
message_queue = Queue() #for passing the stop iteration message
cpu_count = cpu_count() #num of cores on the machine
Data_Remaining = True #loop control
output_values = [] #list to put results in
print_num_records = 0 #used to print how many lines have been processed
my_data_file = "some_data.dat"
data_iterator = BuildDataIterator(my_data_file)
while Data_Remaining:
processes = []
for process_num in range(cpu_count):
if __name__ == "__main__":
p = Process(target=MyFunction, args=(data_iterator,results_queue,Placeholder, message_queue))
processes.append(p)
p.start()
print "Process " + str(process_num) + " Started" #print some stuff to
print_num_records = print_num_records + 50000 #show how far along
print "Processing records through: ", print_num_records #my data file I am
for i,p in enumerate(processes):
print "Joining Process " + str(i)
output_values.append(results_queue.get())
p.join(None)
if not message_queue.empty():
message = message_queue.get()
else:
message = ""
if message == "Out Of Data":
Data_Remaining = False
print "STOP ITERATION NOW PLEASE"
Update:
I discovered a problem with the data iterator. There are approximately 8 million rows in my data set, and after it processes the 8 million it never actually returns a StopIteration, it keeps returning the same 14 rows of data over and over. Here is the code that builds my data iterator:
def BuildDataIterator(my_data_file):
#data_columns is a list of 2-tuples
#headers is a list of strings
#num_lines is 50000
data_reader = read_fwf(my_data_file, colspecs=data_columns, header=None, names=headers, chunksize=num_lines)
data_iterator = data_reader.__iter__()
return data_iterator
This seems like a simple issue but I cant get my head around it.
I have a simulation which runs in a double for loop and writes the results to an HDF file. A simple version of this program is shown below:
import tables as pt
a = range(10)
b = range(5)
def Simulation():
hdf = pt.openFile('simulation.h5',mode='w')
for ii in a:
print(ii)
hdf.createGroup('/','A%s'%ii)
for i in b:
hdf.createArray('/A%s'%ii,'B%s'%i,[ii,i])
hdf.close()
return
Simulation()
This code does exactly what I want but since the process can take quite a while to run I tried to use the multiprocessing module and use the following code:
import multiprocessing
import tables as pt
a = range(10)
b = range(5)
def Simulation(ii):
hdf = pt.openFile('simulation.h5',mode='w')
print(ii)
hdf.createGroup('/','A%s'%ii)
for i in b:
hdf.createArray('/A%s'%ii,'B%s'%i,[ii,i])
hdf.close()
return
if __name__ == '__main__':
jobs = []
for ii in a:
p = multiprocessing.Process(target=Simulation, args=(ii,))
jobs.append(p)
p.start()
This however only prints the last simulation to the HDF file, somehow it overrites all the other groups.
Each time you open a file in write (w) mode, a new file is created -- so the contents of the file is lost if it already exists. Only the last file handle can successfully write to the file. Even if you changed that to append mode, you should not try to write to the same file from multiple processes -- the output will get garbled if two processes try to write at the same time.
Instead, have all the worker processes put output in a queue, and have a single dedicated process (either a subprocess or the main process) handle the output from the queue and write to the file:
import multiprocessing as mp
import tables as pt
num_arrays = 100
num_processes = mp.cpu_count()
num_simulations = 1000
sentinel = None
def Simulation(inqueue, output):
for ii in iter(inqueue.get, sentinel):
output.put(('createGroup', ('/', 'A%s' % ii)))
for i in range(num_arrays):
output.put(('createArray', ('/A%s' % ii, 'B%s' % i, [ii, i])))
def handle_output(output):
hdf = pt.openFile('simulation.h5', mode='w')
while True:
args = output.get()
if args:
method, args = args
getattr(hdf, method)(*args)
else:
break
hdf.close()
if __name__ == '__main__':
output = mp.Queue()
inqueue = mp.Queue()
jobs = []
proc = mp.Process(target=handle_output, args=(output, ))
proc.start()
for i in range(num_processes):
p = mp.Process(target=Simulation, args=(inqueue, output))
jobs.append(p)
p.start()
for i in range(num_simulations):
inqueue.put(i)
for i in range(num_processes):
# Send the sentinal to tell Simulation to end
inqueue.put(sentinel)
for p in jobs:
p.join()
output.put(None)
proc.join()
For comparison, here is a version which uses mp.Pool:
import multiprocessing as mp
import tables as pt
num_arrays = 100
num_processes = mp.cpu_count()
num_simulations = 1000
def Simulation(ii):
result = []
result.append(('createGroup', ('/', 'A%s' % ii)))
for i in range(num_arrays):
result.append(('createArray', ('/A%s' % ii, 'B%s' % i, [ii, i])))
return result
def handle_output(result):
hdf = pt.openFile('simulation.h5', mode='a')
for args in result:
method, args = args
getattr(hdf, method)(*args)
hdf.close()
if __name__ == '__main__':
# clear the file
hdf = pt.openFile('simulation.h5', mode='w')
hdf.close()
pool = mp.Pool(num_processes)
for i in range(num_simulations):
pool.apply_async(Simulation, (i, ), callback=handle_output)
pool.close()
pool.join()
It looks simpler doesn't it? However there is one signficant difference. The original code used output.put to send args to handle_output which was running in its own subprocess. handle_output would take args from the output queue and handle them immediately. With the Pool code above, Simulation accumulates a whole bunch of args in result and result is not sent to handle_output until after Simulation returns.
If Simulation takes a long time, there will be a long waiting period while nothing is being written to simulation.h5.
I have a very weird problem with the code below. when numrows = 10 the Process loops completes itself and proceeds to finish. If the growing list becomes larger it goes into a deadlock. Why is this and how can I solve this?
import multiprocessing, time, sys
# ----------------- Calculation Engine -------------------
def feed(queue, parlist):
for par in parlist:
queue.put(par)
def calc(queueIn, queueOut):
while True:
try:
par = queueIn.get(block = False)
print "Project ID: %s started. " % par
res = doCalculation(par)
queueOut.put(res)
except:
break
def write(queue, fname):
print 'Started to write to file'
fhandle = open(fname, "w")
while True:
try:
res = queue.get(block = False)
for m in res:
print >>fhandle, m
except:
break
fhandle.close()
print 'Complete writing to the file'
def doCalculation(project_ID):
numrows = 100
toFileRowList = []
for i in range(numrows):
toFileRowList.append([project_ID]*100)
print "%s %s" % (multiprocessing.current_process().name, i)
return toFileRowList
def main():
parlist = [276, 266]
nthreads = multiprocessing.cpu_count()
workerQueue = multiprocessing.Queue()
writerQueue = multiprocessing.Queue()
feedProc = multiprocessing.Process(target = feed , args = (workerQueue, parlist))
calcProc = [multiprocessing.Process(target = calc , args = (workerQueue, writerQueue)) for i in range(nthreads)]
writProc = multiprocessing.Process(target = write, args = (writerQueue, 'somefile.csv'))
feedProc.start()
feedProc.join ()
for p in calcProc:
p.start()
for p in calcProc:
p.join()
writProc.start()
writProc.join()
if __name__=='__main__':
sys.exit(main())
I think the problem is the Queue buffer getting filled, so you need to read from the queue before you can put additional stuff in it.
For example, in your feed thread you have:
queue.put(par)
If you keep putting much stuff without reading this will cause it to block untill the buffer is freed, but the problem is that you only free the buffer in your calc thread, which in turn doesn't get started before you join your blocking feed thread.
So, in order for your feed thread to finish, the buffer should be freed, but the buffer won't be freed before the thread finishes :)
Try organizing your queues access more.
The feedProc and the writeProc are not actually running in parallel with the rest of your program. When you have
proc.start()
proc.join ()
you start the process and then, on the join() you immediatly wait for it to finish. In this case there's no gain in multiprocessing, only overhead. Try to start ALL processes at once before you join them. This will also have the effect that your queues get emptied regularyl and you won't deadlock.