os.chdir between multiple python processes - python

I have a complex Python pipeline (whose code I can't change) that calls multiple other scripts and third-party executables. The point is that it takes ages to run over 8000 directories, doing some scientific analyses. So I wrote a simple wrapper (it might not be the most efficient, but it seems to work) using the multiprocessing module.
from os import path, listdir, mkdir, system
from os.path import join as osjoin, exists, isfile
from GffTools import Gene, Element, Transcript
from GffTools import read as gread, write as gwrite, sort as gsort
from re import match
from multiprocessing import JoinableQueue, Process
from sys import argv, exit

# some absolute paths
inbase = "/.../abfgp_in"
outbase = "/.../abfgp_out"
abfgp_cmd = "python /.../abfgp-2.rev/abfgp.py"
refGff = "/.../B0510_manual_reindexed_noSeq.gff"

# the Queue
Q = JoinableQueue()
i = 0

# define number of processes
try: num_p = int(argv[1])
except ValueError: exit("Wrong CPU argument")

# This is the function calling the abfgp.py script, which in turn calls a lot of third-party software
def abfgp(id_, pid):
    out = osjoin(outbase, id_)
    if not exists(out): mkdir(out)
    # logfile
    log = osjoin(outbase, "log_process_%s" % (pid))
    try:
        # call the script
        system("%s --dna %s --multifasta %s --target %s -o %s -q >>%s" % (abfgp_cmd, osjoin(inbase, id_, id_ + ".dna.fa"), osjoin(inbase, id_, "informants.mfa"), id_, out, log))
    except:
        print "ABFGP FAILED"
        return

# parse the output
def extractGff(id_):
    # code not relevant
    pass

# function called by multiple processes, using the Queue
def run(Q, pid):
    while not Q.empty():
        try:
            d = Q.get()
            print "%s\t=>>\t%s" % (str(i - Q.qsize()), d)
            abfgp(d, pid)
            Q.task_done()
        except KeyboardInterrupt:
            exit("Interrupted Child")

# list of directories
genedirs = [d for d in listdir(inbase)]
genes = gread(refGff)
for d in genedirs:
    i += 1
    indir = osjoin(inbase, d)
    outdir = osjoin(outbase, d)
    Q.put(d)

# this loop creates the multiple processes
procs = []
for pid in range(num_p):
    try:
        p = Process(target=run, args=(Q, pid+1))
        p.daemon = True
        procs.append(p)
        p.start()
    except KeyboardInterrupt:
        print "Aborting start of child processes"
        for x in procs:
            x.terminate()
        exit("Interrupted")

try:
    for p in procs:
        p.join()
except:
    print "Terminating child processes"
    for x in procs:
        x.terminate()
    exit("Interrupted")

print "Parsing output..."
for d in genedirs: extractGff(d)
Now the problem is that abfgp.py uses the os.chdir function, which seems to disrupt the parallel processing. I get a lot of errors stating that some (input/output) files or directories cannot be found for reading or writing, even though I call the script through os.system(), which I thought would spawn a separate process and prevent this.
How can I work around this chdir interference?
Edit: I might change os.system() to subprocess.Popen(cwd="...") with the right directory. I hope this makes a difference.
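For example, a minimal sketch of that idea, reusing the names from the wrapper above (whether cwd=out is the right working directory for abfgp.py is an assumption on my part):
from subprocess import Popen

def abfgp(id_, pid):
    out = osjoin(outbase, id_)
    if not exists(out): mkdir(out)
    log = osjoin(outbase, "log_process_%s" % (pid))
    logfile = open(log, "a")
    # launch abfgp.py with its own working directory; any os.chdir it does
    # then happens inside this child only
    args = abfgp_cmd.split() + ["--dna", osjoin(inbase, id_, id_ + ".dna.fa"),
                                "--multifasta", osjoin(inbase, id_, "informants.mfa"),
                                "--target", id_, "-o", out, "-q"]
    p = Popen(args, cwd=out, stdout=logfile, stderr=logfile)
    p.wait()
    logfile.close()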
Thanks.

Edit 2
Do not use os.system(); use subprocess.call().
system("%s --dna %s --multifasta %s --target %s -o %s -q >>%s" %(abfgp_cmd, osjoin(inbase, id_, id_ +".dna.fa"), osjoin(inbase, id_, "informants.mfa"), id_, out, log))
would translate to
subprocess.call(abfgp_cmd.split() + ['--dna', osjoin(inbase, id_, id_ + ".dna.fa"), '--multifasta', osjoin(inbase, id_, "informants.mfa"), '--target', id_, '-o', out, '-q'])  # without the log redirection; abfgp_cmd is split because it contains both "python" and the script path
Edit 1
I think the problem is that multiprocessing uses module names to serialize functions and classes.
This means that if you do import module where module is in ./module.py, and then you do something like os.chdir('./dir'), you would now need to do from .. import module.
The child processes inherit the folder of the parent process. This may be a problem.
Solutions
1. Make sure that all modules are imported (in the child processes) and only after that change the directory.
2. Insert the original os.getcwd() into sys.path to enable imports from the original directory. This must be done before any functions are called from the local directory (see the sketch below).
3. Put all functions that you use inside a directory that can always be imported. The site-packages directory could be such a directory. Then you can do something like import module; module.main() to start what you do.
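A minimal sketch of solutions 1 and 2, assuming the helper modules (GffTools here) live in the directory the script is started from; the target directory is only an example:
import os
import sys

sys.path.insert(0, os.getcwd())   # solution 2: keep the start directory importable

import GffTools                   # solution 1: import everything the children need first ...

os.chdir("/some/other/dir")       # ... and only then change directory (example path)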
This is a hack that I do because I know how pickle works. Only use this if other attempts fail.
The script prints:
serialized # the function runD is serialized
string executed # before the function is loaded the code is executed
loaded # now the function run is deserialized
run # run is called
In your case you would do something like this:
runD = evalBeforeDeserialize('__import__("sys").path.append({})'.format(repr(os.getcwd())), run)
p = Process(target=runD, args=(Q, pid+1))
This is the script:
# functions that you need
class R(object):
def __init__(self, call, *args):
self.ret = (call, args)
def __reduce__(self):
return self.ret
def __call__(self, *args, **kw):
raise NotImplementedError('this should never be called')
class evalBeforeDeserialize(object):
def __init__(self, string, function):
self.function = function
self.string = string
def __reduce__(self):
return R(getattr, tuple, '__getitem__'), \
((R(eval, self.string), self.function), -1)
# code to show how it works
def printing():
print('string executed')
def run():
print('run')
runD = evalBeforeDeserialize('__import__("__main__").printing()', run)
import pickle
s = pickle.dumps(runD)
print('serialized')
run2 = pickle.loads(s)
print('loaded')
run2()
Please report back if these do not work.

You could determine which instance of the os library the unalterable program is using; then create a tailored version of chdir in that library that does what you need -- prevent the directory change, log it, whatever. If the tailored behavior needs to be just for the single program, you can use the inspect module to identify the caller and tailor the behavior in a specific way for just that caller.
Your options are limited if you truly can't alter the existing program; but if you have the option of altering libraries it imports, something like this could be a least-invasive way to skirt the undesired behavior.
Usual caveats apply when altering a standard library.
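A rough sketch of that idea (untested, and only meaningful in the interpreter that actually runs abfgp.py, e.g. from a small wrapper that patches os before importing or executing it): wrap os.chdir so calls from the unalterable script are logged or ignored, while every other caller behaves normally.
import os
import inspect

_original_chdir = os.chdir

def tailored_chdir(path):
    caller = inspect.stack()[1][1]            # filename of the code calling chdir
    if caller.endswith("abfgp.py"):           # tailor behaviour for this caller only
        print "ignoring chdir(%r) from %s" % (path, caller)
        return                                # swallow the directory change
    _original_chdir(path)

os.chdir = tailored_chdir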

Related

Stop one Python script that is running within another

I have a Python app that starts from a main script, let's say main.py. main.py (since my app is organized) references and imports other .py files within the same directory that house other functions. As my app runs continuously, it imports such a function from another script, which is also supposed to run forever until it is explicitly cancelled.
The thing is, how would I cancel that specific script, while leaving its affected variables untouched and the main script/larger app still running?
I do not know how I would go about targeting a specific function to stop its execution.
I use a kill function in my utils to kill any unneeded Python process whose name I know. Note the following code was tested and works on Ubuntu Linux and Mac OS machines.
import os
import signal
import subprocess

def get_running_pids(process_name):
    pids = []
    p = subprocess.Popen(['ps', '-A'], stdout=subprocess.PIPE)
    out, err = p.communicate()
    for line in out.splitlines():
        if process_name in line.decode('utf-8'):
            pid = int(line.decode('utf-8').split(None, 1)[0])
            pids.append(pid)
    return pids

def kill_process_with_name(process_name):
    pids = get_running_pids(process_name)
    for pid in pids:
        os.kill(pid, signal.SIGKILL)
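Usage is then a one-liner; the process name here is hypothetical:
kill_process_with_name("my_forever_script.py")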
You could set up user-defined, custom exceptions by extending Python's built-in Exception object. Further reading here: Python's User-defined Exceptions.
CustomExceptions.py:
class HaltException(Exception):
    pass
-
main.py:
from CustomExceptions import HaltException

class Functions():
    def a(self):
        print("hey")
        self.b()
        return "1"

    def b(self):
        print("hello")
        raise HaltException()

def main():
    func_obj = Functions()
    try:
        func_obj.a()
    except HaltException as e:
        pass
    print("Awesome")

main()
Programs may name their own exceptions by creating a new exception class (see Classes for more about Python classes). Exceptions should typically be derived from the Exception class, either directly or indirectly.

Using threading to run a subprocess in parallel

I have a Linux script that I'm looking to automate through subprocess. Each iteration of subprocess should run the Linux script in each subdirectory of a parent directory, and each of these subprocesses should run in a separate thread.
The way my directory is organized is as follows:
/parent/p1
/parent/p2....and so on till
/parent/p[n]
The first part of my code aims to run the process across all the subdirectories (p1, p2, p3, etc.). It works fine for a fast process. However, many of my jobs need to run in the background, for which I usually use nohup and manually run them on a separate node. So every node in my terminal will run the same job on each directory (p1, p2, p3, etc.). The latter part of my code (using threading) aims to achieve this, but what ends up happening is that every node runs the same process (p1, p1, p1, etc.): basically my entire 'jobs' function is being passed to each thread by runSimThreads, when I want the jobs separated out over the threads. Would someone know how I could further iterate the threading function to place different jobs on each node?
import os
import sys
import subprocess
import os.path
import threading

#takes the argument: python FOLDER_NAME #ofThreads
#Example: python /parent 8
directory = sys.argv[1]  #in my case input is /parent
threads = int(sys.argv[2])  #input is 8
category_name = directory.split('/')[-1]  #splits parent as a word
folder_list = next(os.walk(directory))[1]  #makes a list of subdirectories [p1, p2, p3..]

def jobs(cmd):
    for i in folder_list:
        f = open("/vol01/bin/dir/nohup.out", "w")
        cmd = subprocess.call(['nohup', 'python', 'np.py', '{0}/{1}'.format(directory, i)], cwd='/vol01/bin/dir', stdout=f)
    return cmd

def runSimThreads(numThreads):
    threads = []
    for i in range(numThreads):
        t = threading.Thread(target=jobs, args=(i,))
        threads.append(t)
        t.start()
    #Wait for all threads to complete
    main_thread = threading.currentThread()
    for t in threads:
        if t is main_thread:
            continue
        t.join()

runSimThreads(threads)
That can't be your code.
import os
import sys
import subprocess
import os.path
import threading

#takes the argument: python FOLDER_NAME #ofThreads
#Example: python /parent 8
threads = 8  #input is 8
...
...
for t in threads:
    print("hello")

--output:--
TypeError: 'int' object is not iterable
You are using the same variable names everywhere, and that is confusing you (or me?).
You also do this:
def jobs(cmd):
    for i in folder_list:
        f = open("/vol01/bin/dir/nohup.out", "w")
        cmd = "something"
You are overwriting your cmd parameter variable, which means that jobs() shouldn't have a parameter variable.
Response to comment1:
import threading as thr
import time

def greet():
    print("hello world")

t = thr.Thread(target=greet)
t.start()
t.join()

--output:--
hello world

import threading as thr
import time

def greet(greeting):
    print(greeting)

t = thr.Thread(target=greet, args=("Hello, Newman.",))
t.start()
t.join()

--output:--
Hello, Newman.

Below is the equivalent of what you are doing:
import threading as thr
import time

def greet(greeting):
    greeting = "Hello, Jerry."
    print(greeting)

t = thr.Thread(target=greet, args=("Hello, Newman.",))
t.start()
t.join()

--output:--
Hello, Jerry.
And anyone reading that code would ask, "Why are you passing an argument to the greet() function when you don't use it?"
I'm relatively new to python
Well, your code does this:
threads = 8
#Other irrelevant stuff here
for t in threads:
    print("hello")

and that will produce the error:
TypeError: 'int' object is not iterable
Do you know why?
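Not from the original answer, just a sketch of one way to act on it, reusing directory and folder_list from the question: give each thread exactly one subdirectory instead of letting every thread loop over the whole folder_list (capping the number of concurrent threads is left out for brevity).
import subprocess
import threading

def job(folder):
    # each thread handles a single subdirectory of /parent
    with open("/vol01/bin/dir/nohup.out", "a") as f:   # append so threads do not clobber each other
        subprocess.call(['nohup', 'python', 'np.py',
                         '{0}/{1}'.format(directory, folder)],
                        cwd='/vol01/bin/dir', stdout=f)

workers = []
for folder in folder_list:          # folder_list = [p1, p2, p3, ...]
    t = threading.Thread(target=job, args=(folder,))
    workers.append(t)
    t.start()
for t in workers:
    t.join()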

import modules in python jobs with dispy

I'm working with a program that runs in parallel execution with dispy.
I'm using dispy to create tasks and then distribute them to different CPUs for execution.
I have standard libraries and libraries developed by me (data and connection).
The code is like this:
import dispy
import sys
import data
import connection

def compute(num):
    #some code that calls data and connection methods, and generates a solution
    return solution

def main():
    cluster = dispy.JobCluster(compute)
    jobs = []
    for i in range(10):
        job = cluster.submit(i)
        job.id = i # optionally associate an ID to job (if needed later)
        jobs.append(job)
    for job in jobs:
        job()
        print "Result = " + str(job.result)
        print "Exception = " + str(job.exception)

if __name__ == "__main__":
    main()
The problem is that if I work with data and connection in the main def, it all works fine, and the same if I call compute as a regular function instead of using the dispy library.
But when I run it like that and call a data function inside the compute procedure, it throws an exception that data is not defined and prints Exception None.
Any help? The documentation suggests using setup, but I can't figure out how it works.
Put the import data call inside the compute function.
Dispy ships the function to call along with its arguments to the new process. The new process doesn't have data imported. That's why adding import data inside the function definition should fix this.
JobCluster(compute, depends=[data])
Specify that the compute function depends on whichever modules you need.
If it is a module that you know that all machines have it installed, you can just import data,connections inside the compute function.
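A hedged sketch combining these suggestions (data.process is an illustrative call, not from the original code):
import dispy
import data          # your own module

def compute(num):
    import data      # imported again inside the process that executes the job
    return data.process(num)   # illustrative only

if __name__ == "__main__":
    # depends ships the module to the nodes; dispy's setup parameter, which the
    # question mentions, can be used along the same lines to run import code
    # once per node
    cluster = dispy.JobCluster(compute, depends=[data])
    job = cluster.submit(1)
    job()
    print "Result = " + str(job.result)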
I know it is not elegant, but it works for me. There are two options:
1. Get rid of the main function and put its body in the if __name__ == "__main__" block, because it is likely to be executed when the function gets to the cluster.
2. Define all your module data inside one big function and pass it to the cluster; this is a very simple and yet powerful way.
import dispy
import sys

def compute(num):
    def data_func1(json_):
        #do something to json_
        return json_
    def data_func2(json_):
        #do something diff
        return json_
    #some code that calls data and connection methods, and generates a solution
    return solution

if __name__ == "__main__":
    cluster = dispy.JobCluster(compute)
    jobs = []
    for i in range(10):
        job = cluster.submit(i)
        job.id = i # optionally associate an ID to job (if needed later)
        jobs.append(job)
    for job in jobs:
        job()
        print "Result = " + str(job.result)
        print "Exception = " + str(job.exception)
or define all your functions in the script and pass all of them as depends at job cluster creation time, like:
import dispy
import sys

def data_func1(json_):
    #do something to json_
    return json_

def data_func2(json_):
    #do something diff
    return json_

class DataClass:
    pass

def compute(num):
    #some code that calls data and connection methods, and generates a solution
    return solution

if __name__ == "__main__":
    cluster = dispy.JobCluster(compute, depends=[data_func1,
                                                 data_func2,
                                                 DataClass])
    jobs = []
    for i in range(10):
        job = cluster.submit(i)
        job.id = i # optionally associate an ID to job (if needed later)
        jobs.append(job)
    for job in jobs:
        job()
        print "Result = " + str(job.result)
        print "Exception = " + str(job.exception)

Python multiprocessing module do not work

I am trying to write a spider with the multiprocessing module.
Here is my Python code:
# -*- coding:utf-8 -*-
import multiprocessing
import requests

class SpiderWorker(object):
    def __init__(self, q):
        self._q = q

    def run(self):
        def _crawl_item(url):
            respon = requests.get("http://www.baidu.com")
            if respon.ok:
                print respon.url
        while True:
            rst = self._q.get()
            _crawl_item(rst)

def general_worker():
    q = multiprocessing.Queue()
    CPU_COUNT = multiprocessing.cpu_count()
    worker_processes = [
        multiprocessing.Process(target=SpiderWorker(q).run)
        for i in range(CPU_COUNT)
    ]
    map(lambda process: process.start(), worker_processes)
    return q, worker_processes
Maybe my approach to the processes is wrong. Every time I run this code, my processes tell me:
<Process(Process-1, stopped[SIGSEGV])>
Hope someone can help.
The major problem here is that you don't have any information on why your processes fail. It could be gevent, but it could just as easily be something else. So learning the actual reason why your processes get terminated is the first step before doing anything else.
What you need is multiprocessing.log_to_stderr():
class SpiderWorker(object):
    # ...
    def run(self):
        logger = multiprocessing.log_to_stderr()
        logger.setLevel(multiprocessing.SUBDEBUG)
        try:
            # Here goes your original run() code
            pass
        except Exception:
            logger.exception('whoopsie')
What this code does:
Creates a special logger which will transmit its information to the main process and dump it to stderr (the console by default).
Configures this logger to report everything, including some internal multiprocessing module events (just in case; you probably don't need them).
Wraps your entire code in a catch-all statement, so whatever happens there cannot escape your notice.
Runs the .exception() method on the logger, which not only logs the message (it is meaningless anyway, since we don't know what actually happens) but, most importantly, logs the entire error traceback, which is what we actually need.

Thread wait communication

I have a Python script which calls plugins on a separate thread. The plugins must execute some commands and wait for a signal from the main script before they can proceed.
I used wait() in the plugin and set and clear in the calling script, but I guess that once the plugin thread is called, the main script waits for the thread to complete before continuing. Thus set and wait are never called and the program hangs. I have attached a simplified version of the code.
#!/usr/bin/python
import threading, os, sys, re, imp

e = threading.Event()

class PluginLoader():
    # atexit.register(detective.terminate())

    ## getPlugins - Locate all plugins within the plugin directory
    #  #param self Class object pointer
    #  #param moduleName Name of module
    def getPlugins(self, moduleName):
        try:
            # Folder in which the plugins are stored
            self.pluginFolder = "/tmp/plugins"
            # Give the value of the main module in json file. (This is the name omitting the .py extension)
            self.mainModule = moduleName[0].strip(" ")
            # Load plugin array
            plugins = []
            possibleplugins = os.listdir(self.pluginFolder)
            # Iterate over plugins to determine applicable plugin
            for i in possibleplugins:
                location = os.path.join(self.pluginFolder, i)
                # Skip if not a directory or plugin not in directory
                if not os.path.isdir(location) or not self.mainModule + ".py" in os.listdir(location):
                    continue
                # Otherwise, find the module
                info = imp.find_module(self.mainModule, [location])
                plugins.append({"name": i, "info": info})
            return plugins
        except OSError:
            print "File or folder not found"

    ## loadPlugin - Load plugin into script
    #  #param self Class object pointer
    #  #param plugin Plugin object pointer
    #  #return Plugin object
    def loadPlugin(self, plugin):
        return imp.load_module(self.mainModule, *plugin["info"])

class Threads:
    def run(self):
        self.tuck()
        raw_input("press entr")
        e.set()

    def tuck(self):
        moduleName = ["hello"]
        for i in pluginLoader.getPlugins(moduleName):
            plugin = pluginLoader.loadPlugin(i)
            threading.Thread(name = "block", target=plugin.run(e)).start()
            e.set()
            e.clear()

pluginLoader = PluginLoader()
t = Threads()
t.run()
The following script must be copied into a /tmp/plugins directory and named hello.py:
#!/usr/bin/python
from thre import *

class wait:
    flag = ""

    def run(self, e):
        self.flag = e
        print "in main thread"
        self.prints(e)
        e.wait()
        threading.Thread(target=self.prints12).start()

    def prints(self, e):
        for i in xrange(10):
            print "world"

    def prints12(self):
        for i in xrange(10):
            print "Hey"

w = wait()

def run(e):
    w.run(e)
The main problem seems to be here:
threading.Thread(name = "block", target=plugin.run(e)).start()
plugin.run(e) will directly call the run method of the plugin and block, therefore deadlocking your program. This probably should be:
threading.Thread(name = "block", target=plugin.run, args=(e,)).start()
