I have a question about setting a maximum running time for a function in Python. I would like to use pdfminer to convert .pdf files to .txt.
The problem is that, very often, some files cannot be decoded and take an extremely long time, so I want to use threading.Timer() to limit the conversion time for each file to 5 seconds. In addition, I run under Windows, so I cannot use the signal module for this.
I succeeded in running the conversion code with pdfminer.convert_pdf_to_txt() (imported as "c" in my code), but I am not sure that threading.Timer() in the following code works; it does not seem to properly constrain the processing time for each file.
In summary, I want to:
Convert the pdf to txt
Time limit for each conversion is 5 seconds; if it runs out of time, throw an exception and save an empty file
Save all the txt files under the same folder
If there are any exceptions/errors, still save the file, but with empty content.
Here is the current code:
import converter as c
import os
import timeit
import time
import threading
import thread
yourpath = 'D:/hh/'
def iftimesout():
print("no")
with open("D:/f/"+g+"&"+t+"&"+name+".txt", mode="w") as newfile:
newfile.write("")
for root, dirs, files in os.walk(yourpath, topdown=False):
for name in files:
try:
timer = threading.Timer(5.0,iftimesout)
timer.start()
t=os.path.split(os.path.dirname(os.path.join(root, name)))[1]
a=str(os.path.split(os.path.dirname(os.path.join(root, name)))[0])
g=str(a.split("\\")[1])
with open("D:/f/"+g+"&"+t+"&"+name+".txt", mode="w") as newfile:
newfile.write(c.convert_pdf_to_txt(os.path.join(root, name)))
print("yes")
timer.cancel()
except KeyboardInterrupt:
raise
except:
for name in files:
t=os.path.split(os.path.dirname(os.path.join(root, name)))[1]
a=str(os.path.split(os.path.dirname(os.path.join(root, name)))[0])
g=str(a.split("\\")[1])
with open("D:/f/"+g+"&"+t+"&"+name+".txt", mode="w") as newfile:
newfile.write("")
I finally figured it out!
First of all, define a function to call another function with a limited timeout:
import multiprocessing
def call_timeout(timeout, func, args=(), kwargs={}):
if type(timeout) not in [int, float] or timeout <= 0.0:
print("Invalid timeout!")
elif not callable(func):
print("{} is not callable!".format(type(func)))
else:
p = multiprocessing.Process(target=func, args=args, kwargs=kwargs)
p.start()
p.join(timeout)
if p.is_alive():
p.terminate()
return False
else:
return True
What does the function do?
Checks that the timeout and the function are valid
Starts the given function in a new process, which has a key advantage over a thread: the process can be forcibly terminated
Blocks the program for up to timeout seconds (p.join(timeout)) and lets the function run during that time
Once join returns, checks whether the function is still running
Yes: terminate it and return False
No: fine, no timeout! Return True
We can test it with time.sleep():
import time
finished = call_timeout(2, time.sleep, args=(1, ))
if finished:
print("No timeout")
else:
print("Timeout")
We run a function which needs one second to finish, with the timeout set to two seconds:
No timeout
If we run time.sleep(10) and set the timeout to two seconds:
finished = call_timeout(2, time.sleep, args=(10, ))
Result:
Timeout
Notice the program stops after two seconds without finishing the called function.
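One limitation worth noting: because the function runs in a separate process, call_timeout() discards its return value. If you also need the result, here is a sketch of a variant that passes it back through a multiprocessing.Queue (the names call_timeout_result and _call_and_store are my own, not part of the answer above):
import multiprocessing

def _call_and_store(queue, func, args, kwargs):
    # run func and put its result on the queue so the parent can read it
    queue.put(func(*args, **kwargs))

def call_timeout_result(timeout, func, args=(), kwargs={}):
    queue = multiprocessing.Queue()
    p = multiprocessing.Process(target=_call_and_store, args=(queue, func, args, kwargs))
    p.start()
    p.join(timeout)
    if p.is_alive():
        p.terminate()
        return False, None              # timed out, no result
    try:
        return True, queue.get(timeout=1)   # finished in time, fetch the value
    except Exception:
        return True, None               # finished but produced no result (e.g. it raised)
Usage (on Windows, from inside an if __name__ == "__main__": block): finished, value = call_timeout_result(2, pow, args=(2, 10)) returns (True, 1024).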
Your final code will look like this:
import converter as c
import os
import timeit
import time
import multiprocessing
yourpath = 'D:/hh/'
def call_timeout(timeout, func, args=(), kwargs={}):
if type(timeout) not in [int, float] or timeout <= 0.0:
print("Invalid timeout!")
elif not callable(func):
print("{} is not callable!".format(type(func)))
else:
p = multiprocessing.Process(target=func, args=args, kwargs=kwargs)
p.start()
p.join(timeout)
if p.is_alive():
p.terminate()
return False
else:
return True
def convert(root, name, g, t):
with open("D:/f/"+g+"&"+t+"&"+name+".txt", mode="w") as newfile:
newfile.write(c.convert_pdf_to_txt(os.path.join(root, name)))
for root, dirs, files in os.walk(yourpath, topdown=False):
for name in files:
try:
t=os.path.split(os.path.dirname(os.path.join(root, name)))[1]
a=str(os.path.split(os.path.dirname(os.path.join(root, name)))[0])
g=str(a.split("\\")[1])
finished = call_timeout(5, convert, args=(root, name, g, t))
if finished:
print("yes")
else:
print("no")
with open("D:/f/"+g+"&"+t+"&"+name+".txt", mode="w") as newfile:
newfile.write("")
except KeyboardInterrupt:
raise
except:
for name in files:
t=os.path.split(os.path.dirname(os.path.join(root, name)))[1]
a=str(os.path.split(os.path.dirname(os.path.join(root, name)))[0])
g=str(a.split("\\")[1])
with open("D:/f/"+g+"&"+t+"&"+name+".txt", mode="w") as newfile:
newfile.write("")
The code should be easy to understand; if not, feel free to ask.
I really hope this helps (as it took some time for us to get it right ;))!
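One caveat for Windows (where the question runs): multiprocessing starts child processes there by re-importing the main module, so the folder-walking loop from the final code should sit under an if __name__ == "__main__": guard, or the script may raise a RuntimeError when the children re-import it. A condensed sketch of just that change, reusing call_timeout() and convert() from the final code above:
if __name__ == "__main__":
    # everything that starts processes goes here so child processes
    # spawned on Windows do not re-run it on import
    for root, dirs, files in os.walk(yourpath, topdown=False):
        for name in files:
            t = os.path.split(os.path.dirname(os.path.join(root, name)))[1]
            g = str(os.path.split(os.path.dirname(os.path.join(root, name)))[0].split("\\")[1])
            if not call_timeout(5, convert, args=(root, name, g, t)):
                with open("D:/f/"+g+"&"+t+"&"+name+".txt", mode="w") as newfile:
                    newfile.write("")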
Check the following code and let me know in case of any issues. Also let me know whether you still want the forced-termination feature (KeyboardInterrupt).
import os
import time
from multiprocessing import Process
from converter import convert_pdf_to_txt  # the same converter module the question imports as "c"

path_to_pdf = "C:\\Path\\To\\Main\\PDFs" # No "\\" at the end of path!
path_to_text = "C:\\Path\\To\\Save\\Text\\" # There is "\\" at the end of path!
TIMEOUT = 5 # seconds
TIME_TO_CHECK = 1 # seconds
# Save PDF content into text file or save empty file in case of conversion timeout
def convert(path_to, my_pdf):
my_txt = text_file_name(my_pdf)
with open(my_txt, "w") as my_text_file:
try:
my_text_file.write(convert_pdf_to_txt(path_to + '\\' + my_pdf))
except:
print "Error. %s file wasn't converted" % my_pdf
# Convert file_name.pdf from PDF folder to file_name.text in Text folder
def text_file_name(pdf_file):
return path_to_text + (pdf_file.split('.')[0]+ ".txt")
if __name__ == "__main__":
# for each pdf file in PDF folder
for root, dirs, files in os.walk(path_to_pdf, topdown=False):
for my_file in files:
count = 0
p = Process(target=convert, args=(root, my_file,))
p.start()
# some delay to be sure that text file created
while not os.path.isfile(text_file_name(my_file)):
time.sleep(0.001)
while True:
# if not run out of $TIMEOUT and file still empty: wait for $TIME_TO_CHECK,
# else: close file and start new iteration
if count < TIMEOUT and os.stat(text_file_name(my_file)).st_size == 0:
count += TIME_TO_CHECK
time.sleep(TIME_TO_CHECK)
else:
p.terminate()
break
Related
I'm trying to make a watchdog that listens for changes (file additions/deletions) in a folder.
My problem is that every time I copy/create or delete several files in this folder (and its subfolders), the event chain starts one by one for each and every file.
How can I make the on_event() method be invoked only once, after multiple files are created/deleted?
Let's say I'm copying two images into this folder.
I want the event handler to be invoked only once after the file transfer finishes, and not twice - once for each image - as it currently works.
Thanks!
The code runs on a Raspberry Pi 3 with Python 3.7.
Here's the code:
import os
import time
import psutil
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
i = 0
def show_stats():
global i
read = "read #" + str(i) + ":"
mem = "\nmemory in use: " + str(psutil.virtual_memory().percent)+"%"
cpu = "\ncpu load: " + str(psutil.cpu_percent())+"%"
temp = "\ncurrent " + \
os.popen("vcgencmd measure_temp").readline().replace(
"=", ": ").replace("'C", " C°")
end = "\n=================="
i += 1
stats = read + mem + cpu + temp + end
return stats
class Watcher:
DIRECTORY_TO_WATCH = r'/home/pi/Desktop/jsSlider/images'
def __init__(self):
self.observer = Observer()
print("watching ", self.DIRECTORY_TO_WATCH, "...")
def run(self):
event_handler = Handler()
self.observer.schedule(
event_handler, self.DIRECTORY_TO_WATCH, recursive=True)
self.observer.start()
try:
while True:
time.sleep(5)
print(show_stats())
except Exception as e:
self.observer.stop()
print(e)
self.observer.join()
class Handler(FileSystemEventHandler):
@staticmethod
def on_event(event):
wait = 1
if event.event_type == 'created' or event.event_type == 'deleted':
print("Received event - %s. " %event.src_path, str(event.event_type))
time.sleep(wait) #i found that its best to give some timeout between commands because it overwhelmed the pi for some reason (one second seems to be enough)...
os.system('python /home/pi/Desktop/Slider/scripts/arr_edit.py') #recreate the JS array
time.sleep(wait)
os.system('cp -r /home/pi/Desktop/jsSlider/scripts/imgArr.js /home/pi/Desktop/jsSlider/themes/1') #copy the newly created JS array to its place
time.sleep(wait)
os.system('sudo pkill chromium') #"refresh" the page -the kiosk mode reactivates the process...
# os.system('cls')
print('done!')
if __name__ == '__main__':
w = Watcher()
w.run()
Edit I
There is a poor rpi3 connected to a TV in some clinic, working in kiosk mode to display images from a local html file (with some js code - the slide show runs with an existing JS script - I can upload everything if requested | the images are also on the pi itself).
What I'm trying to achieve is to automatically:
rebuild the JS array (with a working python script - code below (arr_edit.py)).
copy the new array to its desired location. (shell command)
and restart chromium with "pkill chromium". (shell command)
Now, I cannot allow the commands to run every single time someone copies/deletes multiple images - which means:
whenever 2+ images are being added, I cannot "restart" the kiosk
(sudo pkill chromium) each and every time a file is created.
Every time multiple files (images in this case) are copied, a separate created event is fired for each individual image, so for 5 images there will be 5 different events that fire the on_event() method each in its own turn, making the kiosk restart 5 times in a row. (Now think of what happens when a 50-file transfer occurs - the Pi will just crash.)
Therefore, I need a way to invoke the commands only once after the file transfer finishes, regardless of how many files were changed/created/deleted in the folder.
arr_edit.py (not entirely my code):
import os
dir_path = r'/home/pi/Desktop/jsSlider/images'
file_path = r'/home/pi/Desktop/jsSlider/scripts/imgArr.js'
directory = os.fsencode(dir_path)
arr_name = 'images=[\n'
start_str = '{"img":"./images/'
end_str = '"},\n'
images = ''
def writer(array, imagesList):
str_to_write = array + imagesList + ']'
f = open(file_path, 'w')
f.write(str_to_write)
f.close()
file_list = os.listdir(directory)
for file in file_list:
filename = os.fsdecode(file)
if filename.endswith(".jpg") or filename.endswith(".jpeg") or filename.endswith(".webp"):
if file == file_list[len(file_list)-1]:
end_str = '"}\n'
images += start_str + filename + end_str
continue
else:
continue
writer(arr_name, images)
output JS array (sample from inside imgArr.js):
images=[
{"img":"./images/246.jpg"},
{"img":"./images/128.jpg"},
{"img":"./images/238.webp"},
{"img":"./images/198.jpg"},
{"img":"./images/247.webp"}
]
As Mark suggested in the comments,
I added a check to see whether the js file has changed in the past 5 minutes.
If the file has changed,
wait another 5 minutes and re-initiate the change (in case more files have been added to the folder), so the new, larger set of files will also be shown in this run.
Works like a charm!
Many thanks!!
Here's the final watchdog.py:
import os
import time
import psutil
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
i = 0
def show_stats():
global i
read = "read #" + str(i) + ":"
mem = "\nmemory in use: " + str(psutil.virtual_memory().percent)+"%"
cpu = "\ncpu load: " + str(psutil.cpu_percent())+"%"
temp = "\ncurrent " + \
os.popen("vcgencmd measure_temp").readline().replace(
"=", ": ").replace("'C", " C°")
end = "\n=================="
i += 1
stats = read + mem + cpu + temp + end
return stats
def wait_for_file(file):
time.sleep(300)
if age(file) >= 5:
modify()
def modify():
os.system('python /home/pi/Desktop/jsSlider/scripts/arr_edit.py')
os.system(
'cp -r /home/pi/Desktop/jsSlider/scripts/imgArr.js /home/pi/Desktop/jsSlider/themes/1')
time.sleep(1)
os.system('sudo pkill chromium')
# os.system('cls')
print("done!\nwatching...")
def age(filename):
return ((time.time() - os.path.getmtime(filename))//60)
class Watcher:
DIRECTORY_TO_WATCH = r'/home/pi/Desktop/jsSlider/images'
def __init__(self):
self.observer = Observer()
print("watching ", self.DIRECTORY_TO_WATCH, "...")
def run(self):
event_handler = Handler()
self.observer.schedule(
event_handler, self.DIRECTORY_TO_WATCH, recursive=True)
self.observer.start()
try:
while True:
time.sleep(5)
print(show_stats())
except Exception as e:
self.observer.stop()
print(e)
self.observer.join()
class Handler(FileSystemEventHandler):
@staticmethod
def on_any_event(event):
file = r'/home/pi/Desktop/jsSlider/scripts/imgArr.js'
if event.event_type == 'created' or event.event_type == 'deleted':
print("Received event - %s. " %
event.src_path, str(event.event_type))
time.sleep(5)
if age(file) < 5:
wait_for_file(file)
else:
modify()
if __name__ == '__main__':
w = Watcher()
w.run()
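For reference, an alternative and more general way to run the commands only once per burst of events is to debounce them with threading.Timer: every new event cancels and restarts a short timer, and modify() (defined in the script above) runs only after the folder has been quiet for a while. The class name and the 10-second quiet period below are illustrative assumptions, not part of the script above:
import threading

class DebouncedHandler(FileSystemEventHandler):
    # restart a timer on every created/deleted event; modify() runs only
    # once the folder has been quiet for `delay` seconds
    def __init__(self, delay=10):
        self.delay = delay
        self.timer = None
        self.lock = threading.Lock()

    def on_any_event(self, event):
        if event.event_type not in ('created', 'deleted'):
            return
        with self.lock:
            if self.timer is not None:
                self.timer.cancel()
            self.timer = threading.Timer(self.delay, modify)
            self.timer.start()
To use it, schedule an instance of DebouncedHandler instead of Handler() in Watcher.run().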
I have a script that executes a compiled Fortran module. Input then has to be passed to this process in the form of a filename, and Enter must be pressed to initiate processing. I have no real control over the nature of the Fortran executable; it is what it is.
I am using subprocess and communicate() to handle this from Python and it works well. The problem is that I need to process hundreds to thousands of files, and doing them sequentially is slow. While I expect I will eventually run into an I/O bottleneck at the HDD, current execution times are nowhere near this limit.
I attempted to simply wrap the method spawning the subprocess in a ThreadPoolExecutor, but found that only a small subset of the files actually get processed (roughly every 20th, but it varies) and the rest of the files are created but are empty (each is 0 KB and has no contents - as though the subprocess that spawned them was killed prematurely just after creating the handle).
I have tried instead using subprocess.run with an input argument, custom os.pipe()s, TemporaryFile as a pipe, spawning all the subprocesses first and then multithreading the calls to communicate(), and manual delays after spawning the process before communicating, all to no avail.
If I spawn the subprocesses first I can confirm by inspection that the stdout, stdin, and stderr pipe for each has a unique identifier.
This is the code that calls the fortran module
def run_CEA2(fName_prefix):
print(fName_prefix)
CEA_call = subprocess.run('FCEA2.exe', input='{}\n'.format(fName_prefix), encoding='ascii',
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
shell=True, cwd=None, check=False)
if 'DOES NOT EXIST' in CEA_call.stdout:
raise RuntimeError('\nERROR: Stdout returned by run_CEA()\n'+'\t'.join([line+'\n' for line in CEA_call.stdout.split('\n')]))
else:
return True
This is the code that calls the above method asynchronously
import concurrent.futures
def threadedRun(fName):
print('\tExecuting file {}'.format(fName))
run_CEA(fName)
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
executor.map(threadedRun, fNames)
print('\tDone.')
Here is a version of run_CEA using Popen and communicate
def run_CEA(fName_prefix):
print(fName_prefix)
p = subprocess.Popen(['FCEA2.exe'], stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE,shell=True)
return_str = p.communicate(input=('{}\n'.format(fName_prefix)).encode())[0].decode()
if 'DOES NOT EXIST' in return_str:
raise RuntimeError('\nERROR: Stdout returned by run_CEA()\n'+'\t'.join([line+'\n' for line in return_str.split('\n')]))
else:
return True
I do not understand what is causing the premature closure of the spawned processes. As stated above, I can pre-spawn all the subprocesses and then iterate through a list of these, processing each in turn.
When adding concurrent.futures to the mix, it seems signals get crossed and multiple spawned processes are killed at a time.
Interestingly, when I used concurrent.futures only to process the pre-populated list of subprocesses, the behaviour was the same. Even though all processes were already present (not being spawned on the fly as the communicate-and-close processing was occurring), output was produced for roughly every 20th process in the list.
Embarrassingly, the issue was on the Fortran side and became obvious when I stopped piping stderr and allowed it to pass to the console, where I was greeted by:
forrtl: severe (30): / process cannot access file because it is being
used by another process.
The Fortran executable being used was not just reading from a binary but also locking it with write permissions, meaning that it could not be called concurrently by more than one instance of the executable.
To get around this, at runtime I spawn n temporary folders, each with a complete copy of the Fortran executable and its dependencies, and then use the 'cwd' argument in the call to subprocess.run so that a bunch of threads can crunch through the files, each running in its own folder.
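The essential pattern, reduced to a sketch (the executable and library names are the ones used in the full code below; the tempfile-based folder naming is my own illustration):
import os
import shutil
import subprocess
import tempfile

CEA_FILES = ("FCEA2.exe", "b1b2b3.exe", "syntax.exe", "thermo.lib", "trans.lib")

def make_worker_folder(exe_folder):
    # each worker gets its own copy of the executable and the libraries it locks
    folder = tempfile.mkdtemp(prefix="cea_worker_")
    for fname in CEA_FILES:
        shutil.copy(os.path.join(exe_folder, fname), os.path.join(folder, fname))
    return folder

def run_in_folder(folder, fname_prefix):
    # cwd=folder makes this instance read/write only its own copies
    return subprocess.run(os.path.join(folder, "FCEA2.exe"),
                          input="{}\n".format(fname_prefix),
                          encoding="ascii", stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE, cwd=folder, check=False)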
If you are familiar with the NASA CEA code that is what is being called. For completeness below is code for anyone that might benefit.
import os
import shutil
import subprocess
from threading import Thread, Lock, current_thread
import queue
import functools
import threading
def run_CEA(fName_prefix,working_folder=None):
CEA_str = os.path.abspath(os.path.join(working_folder,'FCEA2.exe'))
CEA_call = subprocess.run(CEA_str, input='{}\n'.format(fName_prefix),
encoding='ascii', stdout=subprocess.PIPE, stderr=subprocess.PIPE,
shell=False, cwd=working_folder, check=False)
if 'DOES NOT EXIST' in CEA_call.stdout:
raise RuntimeError('FCEA2.exe could not find specified input file\n'
+'\t'.join([line+'\n' for line in CEA_call.stdout.split('\n')]))
elif CEA_call.stderr:
raise RuntimeError('Error occured in call to FCEA2.exe\n'
+'\t'.join([line+'\n' for line in CEA_call.stderr.split('\n')]))
else:
return 1
def synchronized(lock):
""" Synchronization decorator """
def wrap(f):
@functools.wraps(f)
def newFunction(*args, **kw):
with lock:
return f(*args, **kw)
return newFunction
return wrap
class CEA_Queue(queue.Queue):
""" Based on template at provided by Shashwat Kumar found #
https://medium.com/#shashwat_ds/a-tiny-multi-threaded-job-queue-in-30-lines-of-python-a344c3f3f7f0"""
inp_folder = os.path.abspath('.//inp_files')
out_folder = os.path.abspath('.//out_files')
run_folder = os.path.abspath('.//workers')
exe_folder = os.path.abspath('.//cea_files')
req_cea_files = ["FCEA2.exe",
"b1b2b3.exe",
"syntax.exe",
"thermo.lib",
"trans.lib"]
lock = Lock()
@classmethod
def test_dirs_cls(cls):
print('test_dirs_cls:')
for dirname in ('inp_folder','out_folder','run_folder','exe_folder'):
print(dirname,':',getattr(cls,dirname))
def test_dirs_self(self):
print('test_dirs_self:')
for dirname in ('inp_folder','out_folder','run_folder','exe_folder'):
print(dirname,':',getattr(self,dirname))
@staticmethod
def clean_folder(target,ignore_list=[]):
if os.path.isdir(target):
for fName in os.listdir(target):
fPath = os.path.join(target,fName)
if os.path.isfile(fPath) and not fName in ignore_list:
os.remove(fPath)
elif os.path.isdir(fPath) and not fName in ignore_list:
shutil.rmtree(fPath)
@classmethod
def setup_folders(cls):
for folder in (cls.out_folder,cls.inp_folder,cls.run_folder):
if not os.path.isdir(folder):
os.mkdir(folder)
else:
cls.clean_folder(folder)
if not os.path.isdir(cls.exe_folder):
raise ValueError("Cannot find exe folder at:\n\t{}".format(cls.exe_folder))
else:
cls.clean_folder(cls.exe_folder,ignore_list=cls.req_cea_files)
@classmethod
def cleanup(cls):
cls.clean_folder(cls.run_folder)
out_files = []
for fName in os.listdir(cls.inp_folder):
if '.out' == fName[-4:]:
try:
shutil.move(os.path.join(cls.inp_folder,fName),
os.path.join(cls.out_folder,fName))
out_files.append(os.path.join(cls.out_folder,fName))
except Exception as exc:
print('WARNING: Could not move *.out file\n{}\n{}'.format(fName,exc))
return out_files
@classmethod
def gather_inputs(cls):
inp_files = []
for fName in os.listdir(cls.inp_folder):
if '.inp' in fName[-4:]:
inp_files.append(os.path.join(cls.inp_folder,fName))
return inp_files
@classmethod
def set_dirs(cls,inp_folder=None,out_folder=None,
run_folder=None,exe_folder=None):
if not inp_folder is None:
cls.inp_folder = os.path.abspath(inp_folder)
if not out_folder is None:
cls.out_folder = os.path.abspath(out_folder)
if not run_folder is None:
cls.run_folder = os.path.abspath(run_folder)
if not exe_folder is None:
cls.exe_folder = os.path.abspath(exe_folder)
def __init__(self, num_workers=1,inp_folder=None,out_folder=None,
run_folder=None,exe_folder=None):
queue.Queue.__init__(self)
self.set_dirs(inp_folder,out_folder,run_folder,exe_folder)
self.setup_folders()
self.num_workers = num_workers
self.n_task = 0
self.n_complete = 0
self.update_every = 10.
self.last_update = 0
def add_task(self, fName):
self.put(fName)
def schedule_tasks(self):
inp_files = self.gather_inputs()
for fName in inp_files:
self.add_task(fName.split('.inp')[0])
self.n_task = len(inp_files)
self.n_complete = 0
self.last_update = 0
return inp_files
def progress(self):
return (self.n_complete/self.n_task)*100
def start_workers(self):
self.worker_threads = []
for i in range(self.num_workers):
k = str(i)
worker_folder = os.path.join(self.run_folder,k)
try:
os.mkdir(worker_folder)
for fNameExe in os.listdir(self.exe_folder):
shutil.copy(os.path.join(self.exe_folder,fNameExe),os.path.join(worker_folder,fNameExe))
except Exception as exc:
raise exc
t = Thread(target=self.worker)
t.daemon = True
t.worker_folder = worker_folder
t.start()
self.worker_threads.append(t)
def worker(self):
while True:
try:
worker_folder = current_thread().worker_folder
fName = self.get()
rel_path = os.path.relpath(fName,worker_folder)
run_CEA(rel_path,worker_folder)
except Exception as exc:
print('ERROR: Worker failed on task\n\tFolder:{}\n\tFile:{}\n\t{}'.format(worker_folder,fName,exc))
finally:
self.task_done()
with self.lock:
self.n_complete+=1
current_progress = self.progress()
if (self.last_update==0 or current_progress==100. or
current_progress-self.last_update>=self.update_every):
print('\tCurrent progress: {:>6.2f}%'.format(current_progress))
self.last_update = current_progress
def run(self):
inp_files = self.schedule_tasks()
self.start_workers()
self.join()
out_files = self.cleanup()
return out_files
def tests(self,n):
inp_str = """! EXAMPLE 1
! (a) Assigned-temperature-and-pressure problem (tp).
! (b) Reactants are H2 and Air. Since "exploded ll formulas are not given,
! these formulas will be taken from the thermodynamic data library,
! thermo. lib.
! (c) Calculations are for two equivalence ratios (r,eq.ratio =1,1.5) .
! (d) Assigned pressures are I, 0.1, and 0.01 atm (p(atm)=l, .1, .01).
! (d) Assigned temperatures are 3000 and 2000 K (t(k)=3000,2000).
! (f) 'only' dataset is used to restrict possible products.
! (g) Energy units in the final tables are in calories (calories).
problem case=Example-1 tp p(atm)=1,.1,.01, t(k)=3000,2000,
r,eq.ratio=1,1.5
reac
fuel= H2 moles = 1.
oxid= Air moles = 1.
only Ar C CO CO2 H H2 H2O HNO HO2 HNO2 HNO3 N NH
NO N2 N2O3 O O2 OH O3
output calories
end
"""
self.setup_folders()
for i in range(n):
fName = 'test{:0>4}'.format(i)
fName = os.path.abspath(os.path.join(self.inp_folder,fName+'.inp'))
f = open(fName,'w')
f.write(inp_str)
f.close()
return self.run()
if __name__ == "__main__":
if True:
import time
start_time = time.time()
Q = CEA_Queue(12)
out_files = Q.tests(10_000)
end_time = time.time()
print('Processing took {:5.2f}'.format(end_time-start_time))
On my 8-core machine the sweet spot is at about 12 threads. Below is an example curve comparing runtime to the number of threads handling the workload for one problem.
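To find the sweet spot on your own machine, a quick sweep over worker counts using the CEA_Queue class above can be timed like this (the worker counts and the 1,000-file test size are arbitrary choices):
import time

if __name__ == "__main__":
    for n_workers in (4, 8, 12, 16, 24):
        start = time.time()
        CEA_Queue(n_workers).tests(1_000)   # run the built-in test workload
        print("{:>3} workers: {:6.1f} s".format(n_workers, time.time() - start))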
from multiprocessing import Pool
from functools import partial
from time import sleep
import random
import string
import uuid
import os
import glob
def task_a(param1, param2, mydata):
thread_id = str(uuid.uuid4().hex) # this may not be robust enough to guarantee no collisions, address
output_filename = ''.join([str(thread_id),'.txt'])
# part 1 - create output file for task_b to use
with open(output_filename, 'w') as outfile:
for line in mydata:
outfile.write(line)
# part 2 - do some extra stuff (whilst task_b is running)
sleep(5)
print('Task A finished')
return output_filename # not interested in return val
def task_b(expected_num_files):
processed_files = 0
while processed_files<expected_num_files:
print('I am task_b, waiting for {} files ({} so far)'.format(expected_num_files, processed_files))
path_to_search = ''
for filename in glob.iglob(path_to_search + '*.txt', recursive=True):
print('Got file : {}'.format(filename))
# would do something complicated here
os.rename(filename, filename+'.done')
processed_files+=1
sleep(10)
if __name__ == '__main__':
param1 = '' # dummy variable, need to support in solution
param2 = '' # dummy variable, need to support in solution
num_workers = 2
full_data = [[random.choice(string.ascii_lowercase) for _ in range(5)] for _ in range(100)]
print(full_data)
for i in range(0, len(full_data), num_workers):
print('Going to process {}'.format(full_data[i:i+num_workers]))
p = Pool(num_workers)
task_a_func = partial(task_a, param1, param2)
results = p.map(task_a_func, full_data[i:i+num_workers])
p.close()
p.join()
task_b(expected_num_files=num_workers) # want this running sooner
print('Iteration {} complete'.format(i))
#want to wait for task_a's and task_b to finish
I'm having trouble scheduling these tasks to run concurrently.
task_a is a multiprocessing pool that produces an output file part way through its execution.
task_b MUST process the output files sequentially, but they can be in any order (i.e. as soon as they are available), WHILST task_a continues to run (it will no longer change the output file).
The next iteration must only start when all task_a's have completed AND task_b has completed.
The toy code I have posted obviously waits for task_a's to fully complete before task_b is started (which is not what I want)
I have looked at multiprocessing / subprocess etc. but cannot find a way to launch both the pool and the single task_b process concurrently AND wait for BOTH to finish.
task_b is written as if it could be changed to an external script, but I am still stuck on how to manage the execution.
Should I effectively merge the code from task_b into task_a and somehow pass a flag to ensure one worker per pool 'runs the task_b code' via an if/else - at least then I would just be waiting on the pool to complete?
You can use an interprocess queue to communicate the filenames between task_a and task_b.
Also, initializing the pool repeatedly inside the loop is harmful and unnecessarily slow.
It's better to initialize the pool once at the beginning.
from multiprocessing import Pool, Manager, Event
from functools import partial
from time import sleep
import random
import string
import uuid
import os
import glob
def task_a(param1, param2, queue, mydata):
thread_id = str(uuid.uuid4().hex)
output_filename = ''.join([str(thread_id),'.txt'])
output_filename = 'data/' + output_filename
with open(output_filename, 'w') as outfile:
for line in mydata:
outfile.write(line)
print(f'{thread_id}: Task A file write complete for data {mydata}')
queue.put(output_filename)
print('Task A finished')
def task_b(queue, num_workers, data_size, event_task_b_done):
print('Task b started!')
processed_files = 0
while True:
filename = queue.get()
if filename == 'QUIT':
# Whenever you want task_b to quit, just push 'quit' to the queue
print('Task b quitting')
break
print('Got file : {}'.format(filename))
os.rename(filename, filename+'.done')
processed_files+=1
print(f'Have processed {processed_files} so far!')
if (processed_files % num_workers == 0) or (processed_files == data_size):
event_task_b_done.set()
if __name__ == '__main__':
param1 = '' # dummy variable, need to support in solution
param2 = '' # dummy variable, need to support in solution
num_workers = 2
data_size = 100
full_data = [[random.choice(string.ascii_lowercase) for _ in range(5)] for _ in range(data_size)]
mgr = Manager()
queue = mgr.Queue()
event_task_b_done = mgr.Event()
# One extra worker for task b
p = Pool(num_workers + 1)
p.apply_async(task_b, args=(queue, num_workers, data_size, event_task_b_done))
task_a_func = partial(task_a, param1, param2, queue)
for i in range(0, len(full_data), num_workers):
data = full_data[i:i+num_workers]
print('Going to process {}'.format(data))
p.map_async(task_a_func, full_data[i:i+num_workers])
print(f'Waiting for task b to process all {num_workers} files...')
event_task_b_done.wait()
event_task_b_done.clear()
print('Iteration {} complete'.format(i))
queue.put('QUIT')
p.close()
p.join()
exit(0)
I have a cluster of computers which uses a master node to communicate with the slave nodes in the cluster.
The main problem I'm facing with execnet is being able to kill certain jobs that are running and then have new jobs requeued on the same core that the terminated job was running on (as I want to utilize all cores of the slave nodes at any given time).
As of now there is no way to terminate running jobs using execnet, so I figured that if I could just kill the jobs manually through a bash script, say sudo kill 12345 where 12345 is the PID of the job (obtaining the PID of each job is another thing not supported by execnet, but that's another topic), then it would terminate the job and then requeue another on the same core that was just freed. It does kill the job correctly; however, it closes the connection to that channel (the core; the master node communicates with each core individually) and then does not utilize that core anymore until all jobs are done. Is there a way to terminate a running job without killing the connection to the core?
Here is the script to submit jobs
import execnet, os, sys
import re
import socket
import numpy as np
import pickle, cPickle
from copy import deepcopy
import time
import job
def main():
print 'execnet source files are located at:\n {}/\n'.format(
os.path.join(os.path.dirname(execnet.__file__))
)
# Generate a group of gateways.
work_dir = '/home/mpiuser/pn2/'
f = 'cluster_core_info.txt'
n_start, n_end = 250000, 250008
ci = get_cluster_info(f)
group, g_labels = make_gateway_group(ci, work_dir)
mch = group.remote_exec(job)
args = range(n_start, n_end+1) # List of parameters to compute factorial.
queue = mch.make_receive_queue(endmarker='terminate_channel') # receive queue; the endmarker matches the check in manage_jobs
manage_jobs(group, mch, queue, g_labels, args)
# Close the group of gateways.
group.terminate()
def get_cluster_info(f):
nodes, ncores = [], []
with open(f, 'r') as fid:
while True:
line = fid.readline()
if not line:
fid.close()
break
line = line.strip('\n').split()
nodes.append(line[0])
ncores.append(int(line[1]))
return dict( zip(nodes, ncores) )
def make_gateway_group(cluster_info, work_dir):
''' Generate gateways on all cores in remote nodes. '''
print 'Gateways generated:\n'
group = execnet.Group()
g_labels = []
nodes = list(cluster_info.keys())
for node in nodes:
for i in range(cluster_info[node]):
group.makegateway(
"ssh={0}//id={0}_{1}//chdir={2}".format(
node, i, work_dir
))
sys.stdout.write(' ')
sys.stdout.flush()
print list(group)[-1]
# Generate a string 'node-id_core-id'.
g_labels.append('{}_{}'.format(re.findall(r'\d+',node)[0], i))
print ''
return group, g_labels
def get_mch_id(g_labels, string):
ids = [x for x in re.findall(r'\d+', string)]
ids = '{}_{}'.format(*ids)
return g_labels.index(ids)
def manage_jobs(group, mch, queue, g_labels, args):
args_ref = deepcopy(args)
terminated_channels = 0
active_jobs, active_args = [], []
while True:
channel, item = queue.get()
if item == 'terminate_channel':
terminated_channels += 1
print " Gateway closed: {}".format(channel.gateway.id)
if terminated_channels == len(mch):
print "\nAll jobs done.\n"
break
continue
if item != "ready":
mch_id_completed = get_mch_id(g_labels, channel.gateway.id)
depopulate_list(active_jobs, mch_id_completed, active_args)
print " Gateway {} channel id {} returned:".format(
channel.gateway.id, mch_id_completed)
print " {}".format(item)
if not args:
print "\nNo more jobs to submit, sending termination request...\n"
mch.send_each(None)
args = 'terminate_channel'
if args and \
args != 'terminate_channel':
arg = args.pop(0)
idx = args_ref.index(arg)
channel.send(arg) # arg is copied by value to the remote side of
# channel to be executed. Maybe blocked if the
# sender queue is full.
# Get the id of current channel used to submit a job,
# this id can be used to refer mch[id] to terminate a job later.
mch_id_active = get_mch_id(g_labels, channel.gateway.id)
print "Job {}: {}! submitted to gateway {}, channel id {}".format(
idx, arg, channel.gateway.id, mch_id_active)
populate_list(active_jobs, mch_id_active,
active_args, arg)
def populate_list(jobs, job_active, args, arg_active):
jobs.append(job_active)
args.append(arg_active)
def depopulate_list(jobs, job_completed, args):
i = jobs.index(job_completed)
jobs.pop(i)
args.pop(i)
if __name__ == '__main__':
main()
and here is my job.py script:
#!/usr/bin/env python
import os, sys
import socket
import time
import numpy as np
import pickle, cPickle
import random
import job
def hostname():
return socket.gethostname()
def working_dir():
return os.getcwd()
def listdir(path):
return os.listdir(path)
def fac(arg):
return np.math.factorial(arg)
def dump(arg):
path = working_dir() + '/out'
if not os.path.exists(path):
os.mkdir(path)
f_path = path + '/fac_{}.txt'.format(arg)
t_0 = time.time()
num = fac(arg) # Main operation
t_1 = time.time()
cPickle.dump(num, open(f_path, "w"), protocol=2) # Main operation
t_2 = time.time()
duration_0 = "{:.4f}".format(t_1 - t_0)
duration_1 = "{:.4f}".format(t_2 - t_1)
#num2 = cPickle.load(open(f_path, "rb"))
return '--Calculation: {} s, dumping: {} s'.format(
duration_0, duration_1)
if __name__ == '__channelexec__':
channel.send("ready")
for arg in channel:
if arg is None:
break
elif str(arg).isdigit():
channel.send((
str(arg)+'!',
job.hostname(),
job.dump(arg)
))
else:
print 'Warning! arg sent should be a number or None'
Yes, you are on the right track. Use the psutil library to manage the processes, find their PIDs, etc.
And kill them. No need for involving bash anywhere; Python covers it all.
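A minimal sketch of that psutil approach (the 'job.py' match string is a placeholder for however you identify your remote jobs):
import psutil

def kill_matching(name_fragment):
    # find processes whose command line mentions the fragment and terminate them
    for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
        cmdline = ' '.join(proc.info['cmdline'] or [])
        if name_fragment in cmdline:
            print('terminating pid %d' % proc.info['pid'])
            proc.terminate()   # or proc.kill() for a hard kill

kill_matching('job.py')   # e.g. kill the remote job script by name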
Or, even better, program your script to terminate when the master says so.
It is usually done that way.
You can even make it start another script before terminating itself if you want/need.
Or, if it is the same thing you would be doing in another process, just stop the current work and start the new work in the same script without terminating it at all.
And, if I may make a suggestion: don't read your file line by line; read the whole file and then use *.splitlines(). For small files, reading them in chunks just tortures the I/O. You wouldn't be needing *.strip() either. And you should remove unused imports too.
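Applied to the get_cluster_info() function from the question, that suggestion would look roughly like this:
def get_cluster_info(f):
    # read the whole file at once instead of looping over readline()
    with open(f, 'r') as fid:
        lines = fid.read().splitlines()
    nodes, ncores = [], []
    for line in lines:
        parts = line.split()
        if not parts:          # skip blank lines
            continue
        nodes.append(parts[0])
        ncores.append(int(parts[1]))
    return dict(zip(nodes, ncores))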
I'm working with log files right now. What I need is to read a file line by line for a specified period of time, say 10 seconds. Can anybody tell me if there is a way to accomplish this in Python?
Run tail or tac using Popen and iterate over the output until you reach a line where you want to stop, or, as below, until a time limit is reached. Here is an example snippet.
import sys
import csv
import datetime as dt
from subprocess import Popen, PIPE

filename = '/var/log/nginx/access.log'
# Command to read file from the end
cmd = sys.platform == 'darwin' and ['tail', '-r', filename] or ['tac', filename]
# But if you want read it from beginning, use the following
#cmd = ['cat', filename]
proc = Popen(cmd, close_fds=True, stdout=PIPE, stderr=PIPE)
output = proc.stdout
FORMAT = [
# 'foo',
# 'bar',
]
def extract_log_data(line):
'''Extract data in your log format, normalize it.
'''
return dict(zip(FORMAT, line))
csv.register_dialect('nginx', delimiter=' ', quoting=csv.QUOTE_MINIMAL)
lines = csv.reader(output, dialect='nginx')
started_at = dt.datetime.utcnow()
for line in lines:
data = extract_log_data(line)
print data
if (dt.datetime.utcnow() - started_at) >= dt.timedelta(seconds=10):
break
output.close()
proc.terminate()
Code
from multiprocessing import Process
import time
def read_file(path):
try:
# open file for reading
f = open(path, "r")
try:
for line in f:
# do something
pass
# always close the file when leaving the try block
finally:
f.close()
except IOError:
print "Failed to open/read from file '%s'" % (path)
def read_file_limited_time(path, max_seconds):
# init Process
p = Process(target=read_file, args=(path,))
# start process
p.start()
# for max seconds
for i in range(max_seconds):
# sleep for 1 seconds (you may change the sleep time to suit your needs)
time.sleep(1)
# if process is not alive, we can break the loop
if not p.is_alive():
break
# if process is still alive after max_seconds, kill it!
if p.is_alive():
p.terminate()
def main():
path = "f1.txt"
read_file_limited_time(path,10)
if __name__ == "__main__":
main()
Notes
The reason why we "wake up" every second and check whether the process we started is still alive is to avoid sleeping longer than necessary once the process has finished; it would be a waste of time to keep sleeping for 9 more seconds if the process ended after 1 second.
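If nothing needs to happen between checks, the same behaviour can be had with a single join() call that takes a timeout and returns as soon as the process finishes; a sketch of just that function, reusing read_file() from above:
def read_file_limited_time(path, max_seconds):
    p = Process(target=read_file, args=(path,))
    p.start()
    # join() returns early if the process finishes before max_seconds
    p.join(max_seconds)
    if p.is_alive():
        p.terminate()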