I've researched many pool.map questions on SO and still can't find anything that hints at my issue.
I have if __name__ == '__main__' in every .py file, and I have freeze_support() in each .py that contains import multiprocessing, but I am still at a loss as to what is happening. I've moved freeze_support() around in my code with the same unsuccessful results.
Script A calls Script B, Script B calls Script C (where the multiprocessing happens). Locally this scenario works perfectly, but when I load it to a Windows Server 2008 machine, strange things start happening.
On the server I can see the first item of the iterable printed to the interpreter, but it then jumps back to Script B and keeps processing. There are 51 other items in the list for Script C.
Script B Code:
if not arcpy.Exists(MergedDataFC):
    ScriptC.intersect_main(input1, input2)
if not arcpy.Exists(MergedDataSHP):
    shpList = arcpy.ListFields("*.shp")  # output of multiprocess
    # Merge all shapefiles into single shapefile
    # Being executed before the multiprocess finishes all 52 items
Script C Code:
import multiprocessing as mp

def intersect_main(input1, input2):
    try:
        mp.freeze_support()
        # Create a list of states for input1 polygons
        log.log("Creating Polygon State list...")
        fldList = arcpy.ListFields(input1)
        flds = [fld.name for fld in fldList]
        idList = []
        with arcpy.da.SearchCursor(input1, flds) as cursor:
            for row in cursor:
                idSTATE = row[flds.index("STATE")]
                idList.append(idSTATE)
        idList = set(idList)
        log.log("There are " + str(len(idList)) + " States (polygons) to process.")
        log.log("Sending to pool")
        # declare number of cores to use, use 1 less than the max
        cpuNum = mp.cpu_count() - 1
        # Create the pool object
        pool = mp.Pool(processes=cpuNum)
        # Fire off list to worker function.
        # res is a list built from whatever the worker function returns
        log.log("Entering intersectWork")
        res = pool.map((intersectWork(input1, input2, idSTATE)), idList)
        pool.close()
        pool.join()
        # If an error has occurred report it
        if False in res:
            log.log("A worker failed!")
            log.log(strftime('[%H:%M:%S]', localtime()))
            raise Exception
        else:
            log.log("Finished multiprocessing!")
            log.log(strftime('[%H:%M:%S]', localtime()))
    except Exception as e:
        tb = sys.exc_info()[2]
        # Geoprocessor threw an error
        log.log("An error occurred on line " + str(tb.tb_lineno))
        log.log(str(e))

def intersectWork(input1, input2, idSTATE):
    try:
        if idSTATE == None:
            query = "STATE IS NULL"
            idSTATE = 'pr'
        else:
            query = "STATE = '" + idSTATE + "'"
        DEMOlayer = arcpy.MakeFeatureLayer_management(input1, "input1_" + idSTATE)
        log.log(query)
        arcpy.SelectLayerByAttribute_management(DEMOlayer, "NEW_SELECTION", query)
        # Do the Intersect
        outFC = r'C:/EclipseWorkspace' + '/INTER_' + idSTATE.upper() + '.shp'
        strIntersect = str(DEMOlayer) + ";" + str(input2)
        arcpy.Intersect_analysis(strIntersect, outFC, "ALL", "", "LINE")
        return True
    except:
        # Some error occurred so return False
        log.log(arcpy.GetMessage(2))
        return False

if __name__ == '__main__':
    intersect_main(input1, input2)
Edit
All the data on the server is stored locally; there is no processing across the network.
The issue was that the full path to the data wasn't being properly passed into pool.map() on the server from previous modules. I had to add all the file paths under the import statements. It's not very elegant looking, but it's working.
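For illustration, here is a minimal sketch of that workaround; the dataset paths below are hypothetical placeholders, not the actual data from the question:
# Script C -- module-level paths defined right under the imports, so every
# spawned worker process resolves the same data on the server.
import multiprocessing as mp
import arcpy

INPUT1 = r'C:\Data\polygons.shp'   # hypothetical full path
INPUT2 = r'C:\Data\lines.shp'      # hypothetical full path

def intersect_main(input1=INPUT1, input2=INPUT2):
    # ... multiprocessing logic unchanged ...
    pass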
Related
Maybe my question looks simple (or the bug might be minor), but I haven't been able to find it, and I've really struggled to figure out the issue.
I've created a framework to extract data from Salesforce with the Simple Salesforce package, but I've run into a bug when using multiprocessing.
My code is pretty much straightforward but tedious. I don't want to paste the entire code here, so here is my code from my GitHub.
Issue:
When I call this extract-data function with Pool, the variable defined inside the __name__ == '__main__' block is not available.
In other words, I'm getting NameError: name 'SFAPI' is not defined, but it's there in main as a global variable, and it works without the pool (a single call).
Execution Example:
python "E:\Documents\myPy\SF Project\sf_extraction.py" -pr data_extraction -tn Opportunity Account
Small snippet from my code where I'm getting issues:
def ExtractData(table_name):
    logging.info('Extract Data for Table Name: ' + table_name + ' at ' + getCurrDatetime())
    try:
        rec_count = getRecordCount(table_name)
        print(rec_count)
        if int(rec_count) == 0:
            logging.info('There is no data to Extract for {}'.format(table_name))
        else:
            soql = SFAPI.CreateSOQL(table_name)
            data = SFAPI.ExecuteSOQL(soql, is_count=0)
            extract_file_nm = table_name + '_' + db_name + '_' + sc_name + '_' + curr_datetime + '.csv'
            print(data)
            print(type(data))
            extract_file = os.path.expanduser(os.path.join(script_path, extract_file_nm))
            data.to_csv(extract_file, index=False)
            logging.info('Data has been extracted as {} at {}'.format(extract_file, getCurrDatetime()))
    except Exception as e:
        logging.info('Error in Extraction')
        err_msg = "FATAL_ERROR: In the ExtractData Function : {0}\n\n{1}".format(e, traceback.format_exc())
        raise Exception(str(err_msg))
Snippet from where I'm calling this:
if __name__ == '__main__':
    try:
        SFAPI = SalesforceAPICall(username=config['username'],
                                  password=config['password'],
                                  security_token=config['sf_token'],
                                  )
        if len(table_name) != 0 and 'data_extraction' in process_nm:
            try:
                if len(table_name) == 1:
                    print(table_name[0])
                    ExtractData(table_name[0])
                if type(table_name) == list and len(table_name) > 1:
                    #p = Pool(processes=int(processes))
                    print('Calling Pool : ' + str(os.cpu_count()))
                    #out = p.map(ExtractData, table_name)
                    #p.close()
                    #p.join()
                    p = Pool()
                    print(table_name)
                    x = p.map(ExtractData, table_name)
                    x.get()
                    p.close()
                    p.join()
            except Exception as e:
                if len(table_name) > 1:
                    p.terminate()
                    p.join()
                logging.error("Process Failed - " + str(e))
    except Exception as e:
        chk_err('FATAL_ERROR: ' + " from main exception : {0}\n\n{1}".format(e, traceback.format_exc()))
You can refer to my code on GitHub if this looks clumsy or if you feel this isn't enough information to fix it.
Again, it might be a small bug. I hope you understand what I'm trying to convey. Thanks in advance!
Regards,
Parvathirajan Natarajan
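For reference, a minimal sketch of one pattern that makes such a client available inside each worker: when worker processes are spawned on Windows they re-import the module and never execute the __name__ == '__main__' block, so a global created there does not exist in the children; a Pool initializer can rebuild it per worker. The names below (SalesforceAPICall, config, table_name, ExtractData) are taken from the question, and this is only an illustration, not the poster's fix:
from multiprocessing import Pool

SFAPI = None  # set in every worker by the initializer below

def init_worker(cfg):
    # Rebuild the Salesforce client inside each worker process.
    global SFAPI
    SFAPI = SalesforceAPICall(username=cfg['username'],
                              password=cfg['password'],
                              security_token=cfg['sf_token'])

if __name__ == '__main__':
    with Pool(initializer=init_worker, initargs=(config,)) as p:
        p.map(ExtractData, table_name)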
I am currently making an HTTP request that returns over 3,000,000 records by using the pagination method. Sometimes the call fails with a 104 server error, so I retry and it works on the second or third time.
Because there are so many requests, I am using Python's multiprocessing to speed this along. I'm using an Ubuntu 16 machine with Python 3.5 and 8 cores. The odd thing here is that all the files get written and the process "finishes", i.e. it reaches the end of the range (regardless of the size, so 1 million, 2 million, or 3 million records), but it won't get past the pool line. So my tmux session just says "Working on date (lastrecordnumber)". I need it to get past that point so I can send an email to let me know the task has finished.
I've tried pool.map(), pool.async(), and pool.map_async(); they all seem to have the same issue.
import http.client
import json
import datetime as dt
from multiprocessing import Pool
from functools import partial

def get_raw_data(auth, url_conn, skip):
    headers = {'authorization': "Basic {}".format(auth)}
    success = None
    loop = 0
    while not success:
        try:
            conn = http.client.HTTPSConnection(url_conn)
            conn.request("GET", "SOME_API&$skip={}".format(skip), headers=headers)
            res = conn.getresponse()
            data = res.read()
            raw_data = json.loads(data.decode("utf-8"))
            success = 'yes'
        except Exception as e:
            print('stuck in loop {} {} {}'.format(skip, loop, e))
            loop += 1
    with open('{}.json'.format(skip), 'w') as outfile:
        json.dump(raw_data, outfile)

def process_skips(skip):
    print('Working on date {}'.format(skip))
    get_raw_data(skip)

if __name__ == '__main__':
    print("We started at {}".format(dt.datetime.now()))
    n = range(0, 3597351, 5000)
    n = list(n)
    pool = Pool(8)
    pool.map_async(process_skips, n)
    pool.close()
    pool.join()
Use the pool as a context manager (with), which takes care of cleaning up the pool for you and seems to be the preferred method in the docs.
if __name__ == '__main__':
    print("We started at {}".format(dt.datetime.now()))
    n = list(range(0, 3597351, 5000))
    with Pool(8) as pool:
        result = pool.map_async(process_skips, n)
        result.get()  # wait for all tasks before the pool is torn down
If your main process is working and writing your file correctly, that should make your processes close out correctly.
So I have been playing around with multiprocessing, and I was thinking of upgrading my knowledge so that I can read the first line from the text file for process 1, the second line for process 2, etc.
txt file:
helloworld#world.com
helloworld2#world.com
helloworld3#world.com
helloworld4#world.com
helloworld5#world.com
and this is how the code looks:
import multiprocessing
import sys
import time
from colorama import Fore

def info(thread):
    global prod
    prod = int(thread) + 1
    runit()

def runit():
    log("Profile-" + str(prod) + Fore.GREEN + ' - ' + email)
    # From here I can then use the email for each worker, basically. Or that's the
    # plan at least. The plan is that every worker will have its own email that can
    # be used in here.
    sys.exit()

def main():
    user_input = 0
    while True:
        try:
            user_input = int(input(Fore.WHITE + 'How many tasks do you wanna run? [NUMBERS] \n' + Fore.RESET))
        except ValueError:
            print(Fore.RED + "Stop being stupid" + Fore.RESET)
            continue
        else:
            with open('email.txt') as f:
                content = f.readlines()
            content = [x.strip('\n') for x in content]
            try:
                for i, email in enumerate(content):
                    print(email)
            except ValueError as e:
                print(e)
            HowManyThread = user_input
            i = 0
            jobs = []
            for i in range(HowManyThread):
                p = multiprocessing.Process(target=info, args=(str(i),))
                jobs.append(p)
                time.sleep(.5)
                p.start()
            for p in jobs:
                p.join()
            sys.exit()
log is just a logging function, nothing special. Fore.COLOR is from Colorama.
However, I have no idea what I should do to actually make each process take its own email row. So basically:
Process-1 to take helloworld#world.com
Process-2 to take helloworld2#world.com
Process-3 to take helloworld3#world.com
Process-4 to take helloworld4#world.com
Process-5 to take helloworld5#world.com
Any suggestions on how I can do this? I'm completely lost and have no idea how to move forward.
Update
from multiprocessing import Pool, Process, Queue
from tqdm import tqdm
import json
import sys
import traceback

with open('email.txt') as f:
    content = f.readlines()
global email_list
email_list = [x.strip('\n') for x in content]

def info(thread):
    global prod
    prod = int(thread) + 1
    runit()

def runit(email_index):
    email = email_list[email_index]
    log("Profile-" + str(prod) + Fore.GREEN + ' - ' + email)
    sys.exit()

def main():
    wipe()
    text()
    Infotext = "First name : Last name : Email: : Random char + Street"
    with open('data.json', 'w') as f:
        json.dump(Infotext, f)
        f.write("\n")
    with Pool(8) as pool:
        result_list = list(tqdm(pool.imap_unordered(runit, range(len(email_list)), chunksize=5),
                                total=len(email_list)))

if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        print(e)
        print(traceback.print_exc())
        print(traceback)
The following approach delegates the multiprocessing to a pool of workers, each of which receives a chunk of indices and processes these indices a single line at a time (the choice of poolsize=8 and chunksize=5 here is arbitrary and can be tuned according to your requirements).
The results of all workers are then collected into a final list. Note that imap_unordered is only appropriate if you don't care about the order in which the lines are processed (i.e. result_list does not maintain the original order of content).
from multiprocessing import Pool
# progress bar to track your multiproc
from tqdm import tqdm

with open('email.txt') as f:
    content = f.readlines()

# this list will be accessed by each worker
global email_list
email_list = [x.strip('\n') for x in content]

# define function that worker will apply to each email
# it gets sent an index for the list of emails
# it accesses the email at that index, performs its function and returns
def runit(email_index):
    email = email_list[email_index]
    # do the stuff you're interested in for a single email

# run the multiprocessing to get your results
# this sends the indexes for the emails out to the workers
# and collects the results of runit into result_list
with Pool(8) as pool:
    result_list = list(tqdm(pool.imap_unordered(runit,
                                                range(len(email_list)), chunksize=5),
                            total=len(email_list)))
What you need is a pool of worker processes - though for your use case, I really wonder whether threads (or multiprocessing.dummy) would not be enough.
A pool starts the requested number of worker processes, and you can submit asynchronous tasks to the pool that will be handled by the first free worker.
A stripped-down version of your example (no fancy printing, no unnecessary reading of a sequential file into a list) could be:
import multiprocessing
import time

def runit(prod, email):
    print("Profile-" + str(prod) + ' - ' + email)
    # From here I can then use the email for each worker, basically. Or that's the
    # plan at least. The plan is that every worker will have its own email that can
    # be used in here.
    # sys.exit()  # NEVER CALL sys.exit() EXPLICITLY in a worker process
    time.sleep(1)  # to add a delay inside each task

def main():
    while True:
        try:
            HowManyThread = int(input(
                'How many tasks do you wanna run? [NUMBERS] \n'))
        except ValueError:
            print("Stop being stupid")
            continue
        if HowManyThread == 0:
            break
        pool = multiprocessing.Pool(HowManyThread)
        with open('email.txt') as f:
            for i, email in enumerate(f):
                email = email.strip()
                # runit will be run by a worker process
                pool.apply_async(runit, (i, email))
        pool.close()  # no more tasks to add
        pool.join()   # wait for the last worker to end

if __name__ == "__main__":
    main()
I have a cluster of computers which uses a master node to communicate with the slave nodes in the cluster.
The main problem I'm facing with execnet is being able to kill certain jobs that are running and then having new jobs re-queue on the same core that the other job was just terminated on (as I want to utilize all cores of the slave nodes at any given time).
As of now there is no way to terminate running jobs using execnet, so I figured that if I could just kill the jobs manually through a bash script, say sudo kill 12345 where 12345 is the PID of the job (obtaining the PID of each job is another thing not supported by execnet, but that's another topic), then it would terminate the job and another could be queued on the core that was just freed. It does kill the job correctly; however, it closes the connection to that channel (the core; the master node communicates with each core individually) and then does not utilize that core anymore until all jobs are done. Is there a way to terminate a running job without killing the connection to the core?
Here is the script to submit jobs
import execnet, os, sys
import re
import socket
import numpy as np
import pickle, cPickle
from copy import deepcopy
import time
import job

def main():
    print 'execnet source files are located at:\n {}/\n'.format(
        os.path.join(os.path.dirname(execnet.__file__))
    )

    # Generate a group of gateways.
    work_dir = '/home/mpiuser/pn2/'
    f = 'cluster_core_info.txt'
    n_start, n_end = 250000, 250008
    ci = get_cluster_info(f)
    group, g_labels = make_gateway_group(ci, work_dir)

    mch = group.remote_exec(job)
    # Combined receive queue for all channels; the endmarker is queued
    # whenever a channel closes.
    queue = mch.make_receive_queue(endmarker='terminate_channel')

    args = range(n_start, n_end+1)  # List of parameters to compute factorial.
    manage_jobs(group, mch, queue, g_labels, args)

    # Close the group of gateways.
    group.terminate()

def get_cluster_info(f):
    nodes, ncores = [], []
    with open(f, 'r') as fid:
        while True:
            line = fid.readline()
            if not line:
                fid.close()
                break
            line = line.strip('\n').split()
            nodes.append(line[0])
            ncores.append(int(line[1]))
    return dict(zip(nodes, ncores))

def make_gateway_group(cluster_info, work_dir):
    ''' Generate gateways on all cores in remote nodes. '''
    print 'Gateways generated:\n'
    group = execnet.Group()
    g_labels = []
    nodes = list(cluster_info.keys())
    for node in nodes:
        for i in range(cluster_info[node]):
            group.makegateway(
                "ssh={0}//id={0}_{1}//chdir={2}".format(
                    node, i, work_dir
                ))
            sys.stdout.write(' ')
            sys.stdout.flush()
            print list(group)[-1]
            # Generate a string 'node-id_core-id'.
            g_labels.append('{}_{}'.format(re.findall(r'\d+', node)[0], i))
    print ''
    return group, g_labels

def get_mch_id(g_labels, string):
    ids = [x for x in re.findall(r'\d+', string)]
    ids = '{}_{}'.format(*ids)
    return g_labels.index(ids)

def manage_jobs(group, mch, queue, g_labels, args):
    args_ref = deepcopy(args)
    terminated_channels = 0
    active_jobs, active_args = [], []
    while True:
        channel, item = queue.get()
        if item == 'terminate_channel':
            terminated_channels += 1
            print " Gateway closed: {}".format(channel.gateway.id)
            if terminated_channels == len(mch):
                print "\nAll jobs done.\n"
                break
            continue
        if item != "ready":
            mch_id_completed = get_mch_id(g_labels, channel.gateway.id)
            depopulate_list(active_jobs, mch_id_completed, active_args)
            print " Gateway {} channel id {} returned:".format(
                channel.gateway.id, mch_id_completed)
            print " {}".format(item)
        if not args:
            print "\nNo more jobs to submit, sending termination request...\n"
            mch.send_each(None)
            args = 'terminate_channel'
        if args and args != 'terminate_channel':
            arg = args.pop(0)
            idx = args_ref.index(arg)
            channel.send(arg)  # arg is copied by value to the remote side of
                               # channel to be executed. Maybe blocked if the
                               # sender queue is full.
            # Get the id of current channel used to submit a job,
            # this id can be used to refer mch[id] to terminate a job later.
            mch_id_active = get_mch_id(g_labels, channel.gateway.id)
            print "Job {}: {}! submitted to gateway {}, channel id {}".format(
                idx, arg, channel.gateway.id, mch_id_active)
            populate_list(active_jobs, mch_id_active,
                          active_args, arg)

def populate_list(jobs, job_active, args, arg_active):
    jobs.append(job_active)
    args.append(arg_active)

def depopulate_list(jobs, job_completed, args):
    i = jobs.index(job_completed)
    jobs.pop(i)
    args.pop(i)

if __name__ == '__main__':
    main()
and here is my job.py script:
#!/usr/bin/env python
import os, sys
import socket
import time
import numpy as np
import pickle, cPickle
import random
import job

def hostname():
    return socket.gethostname()

def working_dir():
    return os.getcwd()

def listdir(path):
    return os.listdir(path)

def fac(arg):
    return np.math.factorial(arg)

def dump(arg):
    path = working_dir() + '/out'
    if not os.path.exists(path):
        os.mkdir(path)
    f_path = path + '/fac_{}.txt'.format(arg)
    t_0 = time.time()
    num = fac(arg)  # Main operation
    t_1 = time.time()
    cPickle.dump(num, open(f_path, "w"), protocol=2)  # Main operation
    t_2 = time.time()
    duration_0 = "{:.4f}".format(t_1 - t_0)
    duration_1 = "{:.4f}".format(t_2 - t_1)
    #num2 = cPickle.load(open(f_path, "rb"))
    return '--Calculation: {} s, dumping: {} s'.format(
        duration_0, duration_1)

if __name__ == '__channelexec__':
    channel.send("ready")
    for arg in channel:
        if arg is None:
            break
        elif str(arg).isdigit():
            channel.send((
                str(arg) + '!',
                job.hostname(),
                job.dump(arg)
            ))
        else:
            print 'Warning! arg sent should be number | None'
Yes, you are on the right track. Use the psutil library to manage the processes, find their PIDs, etc., and kill them. There is no need to involve bash anywhere; Python covers it all.
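For illustration, a minimal sketch of that idea; matching worker processes by looking for job.py in their command line is an assumption here, so adapt the test to however your jobs can actually be identified:
import psutil

# Walk all processes and terminate the ones that look like our workers.
for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
    cmdline = proc.info['cmdline'] or []
    if any('job.py' in part for part in cmdline):
        proc.terminate()  # or proc.kill() for an immediate SIGKILL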
Or, even better, program your script to terminate when the master says so; it is usually done that way.
You can even make it start another script before terminating itself if you want or need to.
Or, if it is the same work that you would be doing in another process, just stop the current work and start the new work in the same script without terminating it at all.
And, if I may make a suggestion: don't read your file line by line; read the whole file and then use .splitlines(). For small files, reading them in chunks just tortures the I/O. You wouldn't need .strip() either. And you should remove unused imports too.
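For example, the question's get_cluster_info() rewritten along those lines might look like this (a sketch only):
def get_cluster_info(f):
    with open(f) as fid:
        lines = fid.read().splitlines()   # newlines already removed
    pairs = (line.split() for line in lines if line.strip())
    return dict((node, int(cores)) for node, cores in pairs)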
I'm trying to speed up some data processing using the multiprocessing module; the idea is that I can send a chunk of data to each process I start up in order to utilize all the cores on my machine instead of just one at a time.
So I built an iterator for the data using the pandas read_fwf() function with chunksize=50000 lines at a time. My problem is that eventually the iterator should raise StopIteration; I'm trying to catch this in an except block in the child process and pass it along to the parent using a Queue, to let the parent know it can stop spawning child processes. I have no idea what's wrong, though. What's happening is that it gets to the end of the data and then keeps spawning processes which essentially do nothing.
from multiprocessing import Process, Queue, cpu_count

def MyFunction(data_iterator, results_queue, Placeholder, message_queue):
    try:
        current_data = data_iterator.next()
        #does other stuff here
        #that isn't important
        placeholder_result = "Eggs and Spam"
        results_queue.put(placeholder_result)
        return None
    except StopIteration:
        message_queue.put("Out Of Data")
        return None

results_queue = Queue()   # for passing results from each child process
message_queue = Queue()   # for passing the stop iteration message
cpu_count = cpu_count()   # num of cores on the machine
Data_Remaining = True     # loop control
output_values = []        # list to put results in
print_num_records = 0     # used to print how many lines have been processed

my_data_file = "some_data.dat"
data_iterator = BuildDataIterator(my_data_file)

while Data_Remaining:
    processes = []
    for process_num in range(cpu_count):
        if __name__ == "__main__":
            p = Process(target=MyFunction, args=(data_iterator, results_queue, Placeholder, message_queue))
            processes.append(p)
            p.start()
            print "Process " + str(process_num) + " Started"         #print some stuff to
            print_num_records = print_num_records + 50000            #show how far along
            print "Processing records through: ", print_num_records  #my data file I am
    for i, p in enumerate(processes):
        print "Joining Process " + str(i)
        output_values.append(results_queue.get())
        p.join(None)
    if not message_queue.empty():
        message = message_queue.get()
    else:
        message = ""
    if message == "Out Of Data":
        Data_Remaining = False
        print "STOP ITERATION NOW PLEASE"
Update:
I discovered a problem with the data iterator. There are approximately 8 million rows in my data set, and after it processes the 8 million rows it never actually raises StopIteration; it keeps returning the same 14 rows of data over and over. Here is the code that builds my data iterator:
from pandas import read_fwf

def BuildDataIterator(my_data_file):
    # data_columns is a list of 2-tuples
    # headers is a list of strings
    # num_lines is 50000
    data_reader = read_fwf(my_data_file, colspecs=data_columns, header=None, names=headers, chunksize=num_lines)
    data_iterator = data_reader.__iter__()
    return data_iterator
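One way to sidestep the symptom described above (a sketch only, not the poster's solution) is to drive the chunked reader from the parent process, so the parent rather than the workers observes the end of the data; the column specs and the worker body below are hypothetical placeholders:
from multiprocessing import Pool, cpu_count
from pandas import read_fwf

def process_chunk(chunk):
    # placeholder for the real per-chunk processing
    return len(chunk)

if __name__ == "__main__":
    reader = read_fwf("some_data.dat", colspecs=[(0, 10), (10, 20)],
                      header=None, names=["col_a", "col_b"], chunksize=50000)
    pool = Pool(cpu_count())
    # The parent iterates the reader, so exhaustion ends the loop naturally.
    async_results = [pool.apply_async(process_chunk, (chunk,)) for chunk in reader]
    pool.close()
    pool.join()
    output_values = [r.get() for r in async_results]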