I am currently working on translating Ray methods into Dask, and I want to know how to create five different "workers" for one class, like the Ray program below does (run in a Jupyter notebook):
So far I have a similar method in my Dask version, but each "actor_" prints the same actor ID from the ping function. Could someone help with creating a similar setup in Dask?
"""
#CURRENT VERSION
A simple test of Ray
This example uses placement_group API to spread work around
"""
import random
import os
import platform
import ray
import time
ray.init(ignore_reinit_error=True)
@ray.remote
class Actor():
    def __init__(self, actor_id) -> None:
        self.pid = os.getpid()
        self.hostname = platform.node()
        self.ip = ray._private.services.get_node_ip_address()
        self.actor_id = actor_id

    def ping(self):
        print(f"{self.actor_id} {self.pid} {self.hostname} {self.ip} {time.time()} - ping")
        time.sleep(random.randint(1, 3))
        return f"{self.actor_id}"

@ray.remote
def main():
    # Get list of nodes to use
    print(f"Found {len(actors)} Worker nodes in the Ray Cluster:")
    # Setup one Actor per node
    print(f"Setting up {len(actors)} Actors...")
    time.sleep(1)
    # Ping-Pong test
    messages = [actors[a].ping.remote() for a in actors]
    time.sleep(1)
    for _ in range(20):
        new_messages, messages = ray.wait(messages, num_returns=1)
        for ray_message_id in new_messages:
            pong = ray.get(ray_message_id)
            print(pong, "- pong")
            check = actors[pong].ping.remote()
            time.sleep(1)
            messages.append(check)

actors = {
    "actor1": Actor.remote(actor_id="actor1"),
    "actor2": Actor.remote(actor_id="actor2"),
    "actor3": Actor.remote(actor_id="actor3"),
    "actor4": Actor.remote(actor_id="actor4"),
    "actor5": Actor.remote(actor_id="actor5")
}
print(actors)

if __name__ == "__main__":
    main.remote()
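For reference, Dask has its own actor interface: submitting a class to the cluster with actor=True creates one stateful, worker-pinned instance per submission, and each instance can hold its own actor_id. Below is a minimal sketch of that pattern. The DaskActor class and the 5-worker local cluster are illustrative assumptions, not the asker's code.

# A minimal sketch using Dask actors; DaskActor and the 5-worker cluster are assumptions.
import os
import platform
import time

from dask.distributed import Client


class DaskActor:
    def __init__(self, actor_id):
        self.actor_id = actor_id
        self.pid = os.getpid()
        self.hostname = platform.node()

    def ping(self):
        print(f"{self.actor_id} {self.pid} {self.hostname} {time.time()} - ping")
        return self.actor_id


if __name__ == "__main__":
    client = Client(n_workers=5, threads_per_worker=1)

    # actor=True creates one stateful instance per submission, pinned to a worker.
    actors = {
        name: client.submit(DaskActor, name, actor=True).result()
        for name in ["actor1", "actor2", "actor3", "actor4", "actor5"]
    }

    # Method calls on the actor proxy return ActorFuture objects; .result() blocks.
    for name, actor in actors.items():
        print(actor.ping().result(), "- pong")

Because every proxy wraps its own instance, each ping call here reports its own actor_id rather than a shared one.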
Related
Goal:
Accelerate the random walk generation by using multiple processes.
Get the list of vertices ids from which I want random walks to be generated in an input queue
Start as many processes as possible with the correct parameters
Make them put the random walks into an output queue
Wait for completion
Read the output queue
What I am doing:
# Libraries imports
from multiprocessing import cpu_count, Process, Queue
import queue
import configparser
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __
# Function the processes are supposed to execute
def job(proc_id: int, siq: Queue, rwq: Queue, g: AnonymousTraversalSource, length: int):
    while True:
        try:
            # Get next element in ids queue
            start_id = siq.get_nowait()
        except queue.Empty:
            # If the ids queue is empty, then terminate
            break
        else:
            # Do a random walk of length <length> from the vertex with id <start_id>
            random_walk = g.V(start_id).repeat(
                __.local(__.both().sample(1))
            ).times(length).path().next()
            print(f"{proc_id}: rw obtained")
            # Transform the list of vertices into a comma-separated string of ids
            rwq.put(",".join(
                [str(v.id) for v in random_walk]
            ))
            print(f"{proc_id}: rw handled")

if __name__ == "__main__":
    # Get the parameters from the <config.ini> configuration file
    config = configparser.RawConfigParser()
    config.read("config.ini")
    jg_uri = config["JANUSGRAPH"]["URI"]
    file_random_walks = config["FILES"]["RANDOM_WALKS"]
    walks_nb_per_node = int(config["WALKS"]["NB_PER_NODE"])
    walks_length = int(config["WALKS"]["LENGTH"])
    # Connect to Janus Graph
    connection = DriverRemoteConnection(jg_uri, "g")
    g_main = traversal().withRemote(connection)
    # Instantiate the queues and populate the ids one
    start_ids_queue = Queue()
    random_walks_queue = Queue()
    for vertex in g_main.V().has("vertex_label", "<label>").fold().next():
        start_ids_queue.put(vertex.id)
    # Create and start the processes
    nb_processes = cpu_count()
    processes = []
    for i in range(nb_processes):
        p = Process(target=job, args=(
            i,
            start_ids_queue,
            random_walks_queue,
            g_main,
            walks_length
        ))
        processes.append(p)
        p.start()
    for p in processes:
        p.join()
    # Once the processes are terminated, read the random walks queue
    random_walks = []
    while not random_walks_queue.empty():
        random_walks.append(random_walks_queue.get())
    # Do something with the random walks
    ...
Issue:
Once the processes are started, nothing seems to happen. I never get the X: rw obtained / X: rw handled messages. With a bit more logging, I can see that the queries are sent but never finish.
In the logs, when performing the first g_main.V().has("vertex_label", "<label>").fold().next() in the main process (when I populate the ids queue), I have the following message:
DEBUG:gremlinpython:submit with bytecode '[['V'], ['has', 'vertex_label', 'movie'], ['fold']]'
DEBUG:gremlinpython:message '[['V'], ['has', 'vertex_label', '<label>'], ['fold']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V'], ['has', 'vertex_label', '<label>'], ['fold']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
When the other processes send their queries, I have similar logs:
DEBUG:gremlinpython:submit with bytecode '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:message '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
The issue seems not to reside in the query sent, but instead in the indefinite wait that ensues.
If you know of an issue with gremlinpython and multiprocessing, if there is a problem in my multi-processing code, or if you have any explanation that I may have overlooked, please explain to me! Thanks a lot to everyone reading this!
Solutions:
The first partial solution that I found is to use multi-threading instead of multiprocessing:
import configparser
import logging
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __
import threading
class myThread(threading.Thread):
    def __init__(self, thread_id, g, length):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.thread_count = 0
        self.gtraversal = g
        self.walk_length = length

    def run(self):
        while True:
            start_ids_list_lock.acquire()
            try:
                start_id = start_ids_list.pop(0)
                start_ids_list_lock.release()
            except IndexError:
                start_ids_list_lock.release()
                break
            else:
                self.thread_count += 1
                random_walk = job(
                    vertex_id=start_id,
                    g=self.gtraversal,
                    length=self.walk_length
                )
                random_walks_list_lock.acquire()
                random_walks_list.append(random_walk)
                random_walks_list_lock.release()
                logging.info(f"Thread {self.thread_id}: {self.thread_count} done")

def job(vertex_id: int, g: AnonymousTraversalSource, length: int) -> str:
    random_walk = g.V(vertex_id).repeat(
        __.local(__.both().sample(1))
    ).times(length).path().next()
    return ",".join(str(v.id) for v in random_walk)
config = configparser.RawConfigParser()
config.read("config.ini")
jg_uri = config["JANUSGRAPH"]["URI"]
file_random_walks = config["FILES"]["RANDOM_WALKS"]
walks_length = int(config["WALKS"]["LENGTH"])
connection = DriverRemoteConnection(jg_uri, "g")
g_main = traversal().withRemote(connection)
threads = []
start_ids_list = []
random_walks_list = []
random_walks_list_lock = threading.Lock()
start_ids_list_lock = threading.Lock()
start_ids_list = [vertex.id for vertex in g_main.V().has("vertex_label", "<label>").fold().next()]
nb_vertices = len(start_ids_list)
nb_threads = 6
for i in range(nb_threads):
    thread = myThread(
        thread_id=i,
        g=g_main,
        length=walks_length
    )
    thread.start()
    threads.append(thread)
for t in threads:
    t.join()
# Do something with the random walks
...
This solution works and noticeably improves the execution time of the program. It isn't a full answer though, as it doesn't explain why multiprocessing did not behave as I expected.
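A plausible explanation (my assumption, not confirmed by the gremlinpython maintainers): DriverRemoteConnection wraps a websocket and an asyncio event loop, and when the main process forks the children inherit that connection in an unusable state, so traversals submitted from them hang forever. One workaround worth trying is to open one connection per process instead of sharing g_main. A sketch of a modified job, reusing the imports from the multiprocessing script above, is shown here; the pass-the-URI-and-connect-after-the-fork pattern is the assumption.

# Sketch: give each worker process its own connection instead of sharing g_main.
# Untested against JanusGraph; relies on the same config values as above.
def job(proc_id: int, siq: Queue, rwq: Queue, jg_uri: str, length: int):
    connection = DriverRemoteConnection(jg_uri, "g")   # created after the fork
    g = traversal().withRemote(connection)
    try:
        while True:
            try:
                start_id = siq.get_nowait()
            except queue.Empty:
                break
            random_walk = g.V(start_id).repeat(
                __.local(__.both().sample(1))
            ).times(length).path().next()
            rwq.put(",".join(str(v.id) for v in random_walk))
            print(f"{proc_id}: rw handled")
    finally:
        connection.close()

# The Process creation would then pass the URI rather than the traversal source:
# Process(target=job, args=(i, start_ids_queue, random_walks_queue, jg_uri, walks_length))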
I'm trying to add multithreading to a very time-consuming program, and I've come across this SO answer:
https://stackoverflow.com/a/28463266/3451339, which basically offers this solution for multiple arrays:
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(4)
results = pool.map(my_function, my_array)
# Close the pool and wait for the work to finish
pool.close()
pool.join()
and, passing multiple arrays:
results = pool.starmap(function, zip(list_a, list_b))
The following is the code I have so far, which must be refactored with threading. It iterates over 4 arrays, passes arguments to the function at each iteration, and appends all results to a final container:
strategies = ['strategy_1', 'strategy_2']
budgets = [90,100,110,120,130,140,150,160]
formations=['343','352','433','442','451','532','541']
models = ['model_1', 'model_2', 'model_3']
all_teams = pd.DataFrame()
for strategy in strategies:
    for budget in budgets:
        for formation in formations:
            for model in models:
                team = function(strategy=strategy,
                                budget=budget,
                                curr_formation=formation,
                                model=model)
                all_teams = all_teams.append(team, ignore_index=True, sort=False)\
                    .reset_index(drop=True)\
                    .copy()
Note: each function call makes API web requests.
What is the way to go with multithreading in this scenario?
Python has the multiprocessing module, which can run multiple tasks in parallel, and inside each process you can run multiple threads or async IO code.
Here is a working example that uses 3 processes and multithreading:
import pandas as pd
import multiprocessing
from multiprocessing import Queue
from threading import Thread
strategies = ['strategy_1', 'strategy_2']
budgets = [90,100,110,120,130,140,150,160]
formations=['343','352','433','442','451','532','541']
models = ['model_1', 'model_2', 'model_3']
# Shared Queue; if you want to reduce write locking, use 3 Queues (one per process)
Q = Queue()
# Retrieve asynchronously if you want to speed up the process
def function(q, strategy, budget, curr_formation, model):
    q.put("Team")

def runTask(model, q):
    for strategy in strategies:
        for budget in budgets:
            for formation in formations:
                Thread(target=function, args=(q, strategy, budget, formation, model)).start()

def main():
    p1 = multiprocessing.Process(target=runTask, args=('model_1', Q))
    p2 = multiprocessing.Process(target=runTask, args=('model_2', Q))
    p3 = multiprocessing.Process(target=runTask, args=('model_3', Q))
    p1.start()
    p2.start()
    p3.start()
    p1.join()
    p2.join()
    p3.join()
    all = []
    for i in range(0, Q.qsize()):
        all.append(Q.get())
    print(all)
    print(len(all))

if __name__ == "__main__":
    main()
A useful article: Multiprocessing in Python | Set 2
This can be one approach.
Note on thread vs. multiprocess: the earlier SO answer runs the work through map, which will not work here, as map is limited in the number of arguments it can pass.
Run your nested for loops and build a list of parameters ==> financial_options
financial_options = []
for strategy in strategies:
    for budget in budgets:
        for formation in formations:
            for model in models:
                financial_options.append([strategy, budget, formation, model])
financial_options_len = len(financial_options)
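As a side note, the same parameter list can be built more compactly with itertools.product; a small sketch equivalent to the nested loops above:

# Equivalent to the nested loops: one entry per (strategy, budget, formation, model) combination.
from itertools import product

financial_options = [list(combo) for combo in product(strategies, budgets, formations, models)]
financial_options_len = len(financial_options)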
Create a new function that will handle API calls
def access_url(url, parameter_list):
    #response=requests.get(url) # request goes here
    print(parameter_list)
    time.sleep(2)
    print("sleep done!")
    return "Hello"  #,parameter_list # return type
Now run the threading with these permutation parameters. The complete program will look like this:
import concurrent.futures
import requests # just in case needed
from bs4 import BeautifulSoup # just in case needed
import time
import pandas as pd
def access_url(url, parameter_list):
    #response=requests.get(url) # request goes here
    print(parameter_list)
    time.sleep(2)
    print("sleep done!")
    return "Hello"  #,parameter_list # return type

def multi_threading():
    test_url = "http://bla bla.com/"
    base_url = test_url
    THREAD_MULTI_PROCESSING = True
    strategies = ['strategy_1', 'strategy_2']
    budgets = [90,100,110,120,130,140,150,160]
    formations = ['343','352','433','442','451','532','541']
    models = ['model_1', 'model_2', 'model_3']
    all_teams = pd.DataFrame()
    start = time.perf_counter()  # start time for performance
    financial_options = []
    decision_results = []
    for strategy in strategies:
        for budget in budgets:
            for formation in formations:
                for model in models:
                    financial_options.append([strategy, budget, formation, model])
    financial_options_len = len(financial_options)
    print(f"Total options:{financial_options_len}")
    future_list = []
    THREAD_MULTI_PROCESSING_LOOP = True
    if THREAD_MULTI_PROCESSING_LOOP:
        with concurrent.futures.ThreadPoolExecutor() as executor:  # through executor
            for each in range(financial_options_len):
                future = executor.submit(access_url, test_url, financial_options[each])  # submit each option
                future_list.append(future)
            for f1 in concurrent.futures.as_completed(future_list):
                r1 = f1.result()
                decision_results.append(r1)
    end = time.perf_counter()  # finish time for performance
    print(f'Threads: Finished in {round(end - start,2)} second(s)')
    df = pd.DataFrame(decision_results)
    df.to_csv("multithread_for.csv")
    return df, decision_results

df, results = multi_threading()
Let's say we have a concurrent SMACH container sm_con which includes two state machines, SM1 and SM2. I need a way for SM1 to continuously update some data and for SM2 to access (and eventually also modify) the same data. I thought about solving this by passing the userdata of sm_con to SM1 and SM2 as input and output keys, hoping that if SM1 modified the data it would automatically overwrite sm_con's userdata (similar to working with pointers in C++), but this doesn't work.
The corresponding code would look like this:
import smach
import smach_ros
import rospy
class st1(smach.State):
    def __init__(self, outcomes=['successful', 'preempted']):
        smach.State.__init__(self, outcomes, input_keys=['in_test'], output_keys=['out_test'])

    def execute(self, userdata):
        if self.preempt_requested():
            self.service_preempt()
            return 'preempted'
        rospy.logerr('test 1: ' + str(userdata.in_test))
        userdata.out_test = userdata.in_test + 1
        return 'successful'

class st2(smach.State):
    def __init__(self, outcomes=['successful', 'preempted']):
        smach.State.__init__(self, outcomes, input_keys=['in_test'])

    def execute(self, userdata):
        #time.sleep(2)
        if self.preempt_requested():
            self.service_preempt()
            return 'preempted'
        rospy.logerr('test 2: ' + str(userdata.in_test))
        return 'successful'

if __name__ == "__main__":
    rospy.init_node('test_state_machine')

    sm_con = smach.Concurrence(outcomes=['success'],
                               default_outcome='success'
                               )
    sm_con.userdata.testdata = 0

    with sm_con:
        sm_1 = smach.StateMachine(outcomes=['success', 'preempted'], input_keys=['testdata'], output_keys=['testdata'])
        with sm_1:
            smach.StateMachine.add('ST1', st1(),
                                   remapping={'in_test': 'testdata', 'out_test': 'testdata'},
                                   transitions={'successful': 'ST1'})

        sm_2 = smach.StateMachine(outcomes=['success', 'preempted'], input_keys=['testdata'])
        with sm_2:
            smach.StateMachine.add('ST2', st2(),
                                   remapping={'in_test': 'testdata'},
                                   transitions={'successful': 'ST2'})

        smach.Concurrence.add('SM1', sm_1)
        smach.Concurrence.add('SM2', sm_2)

    # Execute SMACH plan
    outcome = sm_con.execute()
    print('exit-outcome:' + outcome)

    # Wait for ctrl-c to stop the application
    rospy.spin()
Running this code, the output 'test 1: ...' shows that inside SM1 the userdata gets incremented, while the output 'test 2: ...' shows that SM2 never sees the incremented data, as its value remains 0.
How can I modify some data in SM1 and access the modified data in SM2?
I found a workaround for this using mutable objects, as described here.
Applied to the code above, it looks like the following:
import smach
import smach_ros
import rospy
class st1(smach.State):
    def __init__(self, outcomes=['successful', 'preempted']):
        smach.State.__init__(self, outcomes, input_keys=['in_test'])

    def execute(self, userdata):
        if self.preempt_requested():
            self.service_preempt()
            return 'preempted'
        rospy.logerr('test 1: ' + str(userdata.in_test))
        userdata.in_test[0] = userdata.in_test[0] + 1
        return 'successful'

class st2(smach.State):
    def __init__(self, outcomes=['successful', 'preempted']):
        smach.State.__init__(self, outcomes, input_keys=['in_test'])

    def execute(self, userdata):
        #time.sleep(2)
        if self.preempt_requested():
            self.service_preempt()
            return 'preempted'
        rospy.logerr('test 2: ' + str(userdata.in_test[0]))
        return 'successful'

if __name__ == "__main__":
    rospy.init_node('test_state_machine')

    sm_con = smach.Concurrence(outcomes=['success'],
                               default_outcome='success'
                               )
    sm_con.userdata.testdata = [0]

    with sm_con:
        sm_1 = smach.StateMachine(outcomes=['success', 'preempted'], input_keys=['testdata'])
        with sm_1:
            smach.StateMachine.add('ST1', st1(),
                                   remapping={'in_test': 'testdata'},
                                   transitions={'successful': 'ST1'})

        sm_2 = smach.StateMachine(outcomes=['success', 'preempted'], input_keys=['testdata'])
        with sm_2:
            smach.StateMachine.add('ST2', st2(),
                                   remapping={'in_test': 'testdata'},
                                   transitions={'successful': 'ST2'})

        smach.Concurrence.add('SM1', sm_1)
        smach.Concurrence.add('SM2', sm_2)

    # Execute SMACH plan
    outcome = sm_con.execute()
    print('exit-outcome:' + outcome)

    # Wait for ctrl-c to stop the application
    rospy.spin()
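The reason the list wrapper helps (my understanding, not an official SMACH statement): the concurrence appears to hand each child container a shallow copy of its userdata, so an int is copied by value, while a list is copied by reference and both children end up aliasing the same underlying object. A plain-Python illustration of that aliasing:

# Plain Python, no SMACH: shallow copies of a mapping share the mutable values.
outer = {'testdata': [0]}           # the concurrence's userdata (list-wrapped int)
copy_for_sm1 = dict(outer)          # roughly what SM1 receives (shallow copy)
copy_for_sm2 = dict(outer)          # roughly what SM2 receives (shallow copy)

copy_for_sm1['testdata'][0] += 1    # ST1 increments "through" the list
print(copy_for_sm2['testdata'][0])  # 1 -- SM2 sees the change immediately
print(outer['testdata'][0])         # 1 -- so does the outer container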
Since this is only a workaround, refer to my corresponding issue-post here for more information.
After running some computations nicely in a linear fashion with a moderator script (cf. below) calling an inner one that performs the computation, I struggle to get it running with multiprocessing. It seems that each CPU core runs through this list (testRegister) and launches a computation even if another core already performed that task earlier (in the same session). How can I prevent this chaotic behaviour? It is my first time calling multiple processors from Python.
Correction: the initial post did not show that each test is a string calling "the inner script" with varying parameters m1 and m2 beside fixed arguments arg1 and arg2 that belong solely to this "inner script".
#!/usr/bin/env python3
import os
import subprocess as sub
import sys
import multiprocessing

fileRegister = []
testRegister = []

def fileCollector():
    for file in os.listdir("."):
        if file.endswith(".xyz"):
            fileRegister.append(file)
    fileRegister.sort()
    return fileRegister

def testSetup():
    data = fileRegister
    while len(data) > 1:
        for entry in fileRegister[1:]:
            m1 = str(fileRegister[0])
            m2 = str(entry)
            test = str("python foo.py ") + str(m1) + str(" ") + str(m2) +\
                str(" --arg1 --arg2")  # formulate test condition
            testRegister.append(test)
        testRegister.sort()
        del data[0]
    return testRegister

def shortAnalysator():
    for entry in testRegister:
        print(str(entry))
        sub.call(entry, shell=True)
        del testRegister[0]

def polyAnalysator():
    # apparently each CPU core works as if the register were not shared
    # reference: https://docs.python.org/3.7/library/multiprocessing.html
    if __name__ == '__main__':
        jobs = []
        for i in range(3):  # safety margin to not consume all CPU
            p = multiprocessing.Process(target=shortAnalysator)
            jobs.append(p)
            p.start()

fileCollector()
testSetup()
shortAnalysator()  # proceeding expectably on one CPU (slow)
# polyAnalysator() # causing irritation
sys.exit()
Your polyAnalysator is running the shortAnalysator three times. Try changing your polyAnalysator as follows, and add the f function. This uses the multiprocessing Pool:
from multiprocessing import Pool
def f(test):
    sub.call(test, shell=True)

def polyAnalysator():
    # apparently each CPU core works as if the register were not shared
    # reference: https://docs.python.org/3.7/library/multiprocessing.html
    with Pool(3) as p:
        p.map(f, testRegister)
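For completeness, a sketch of how the driver part of the patched script might then look (my assumption about the intended call order; the Pool distributes testRegister, so each test runs exactly once):

# Hypothetical driver section for the patched script.
if __name__ == "__main__":
    fileCollector()     # populate fileRegister
    testSetup()         # build testRegister from the file pairs
    polyAnalysator()    # run the tests on 3 worker processes via Pool.map
    sys.exit()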
I have code that spawns multiple processes to check the line count of files and maintain the records in a database. The working code is shown below:
import multiprocessing as mp
from multiprocessing import Pool
import os
import time
import mysql.connector
"""Function to check the count of the file"""
def file_wc(fname):
    with open('/home/vaibhav/Desktop/Input_python/' + fname) as f:
        count = sum(1 for line in f)
    return (fname, count)

class file_audit:
    def __init__(self):
        """Initialising the constructor for getting the names of files
        and referencing the outside-class function"""
        folder = '/home/vaibhav/Desktop/Input_python'
        self.fnames = (name for name in os.listdir(folder))
        self.file_wc = file_wc

    def count_check(self):
        """Creating 4 worker processes to check the count of the files in parallel"""
        pool = Pool(4)
        self.m = list(pool.map(self.file_wc, list(self.fnames), 4))
        pool.close()
        pool.join()

    def database_updation(self):
        """To maintain an entry in the database with details
        like filename and records present in the file"""
        self.db = mysql.connector.connect(host="localhost", user="root", password="root", database="python_showtime")
        # prepare a cursor object using cursor() method
        self.cursor = self.db.cursor()
        query_string = ("INSERT INTO python_showtime.audit_capture"
                        "(name,records)"
                        "VALUES(%s,%s)")
        #data_user = (name,records)
        for each in self.m:
            self.cursor.execute(query_string, each)
        self.db.commit()
        self.cursor.close()

start_time = time.time()
#if __name__ == '__main__':
x = file_audit()
x.count_check()        # To check the count by spawning multiple processes
x.database_updation()  # To maintain the entry in the database
print("My program took", time.time() - start_time, "to run")
Point to be considered
Now if I put my function inside the class and comment out self.file_wc=file_wc in the constructor section, I get the error "can't pickle generator objects". I have a fair understanding that some objects cannot be pickled, so I want to know what exactly is happening in the background, in very simple terms. I got the reference from here or here to make the code work.
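My understanding of what happens (hedged, based on how multiprocessing generally works): Pool.map has to pickle the callable and send it to the worker processes. A plain module-level function like file_wc pickles by name, but a bound method drags the whole instance along, and that instance holds self.fnames, a generator, and generators cannot be pickled. A tiny demonstration of that limitation:

import pickle

gen = (name for name in ["a.txt", "b.txt"])   # a generator, like self.fnames
try:
    pickle.dumps(gen)
except TypeError as exc:
    print(exc)                                # e.g. "cannot pickle 'generator' object"

pickle.dumps(list(gen))                       # materialising it as a list pickles fine

So keeping file_wc at module level (and only referencing it from the class) avoids pickling the instance; alternatively, storing the filenames as a list instead of a generator would also sidestep the error.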