Goal:
Accelerate the random walk generation by using multiple processes.
Get the list of vertices ids from which I want random walks to be generated in an input queue
Start as much processes as possible with the correct parameters
Make them put the random walks into an output queue
Wait for completion
Read the output queue
What I am doing:
# Libraries imports
from multiprocessing import cpu_count, Process, Queue
import queue
import configparser
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __
# Function the processes are supposed to execute
def job(proc_id:int, siq:Queue, rwq:Queue, g:AnonymousTraversalSource, length:int):
while True:
try:
# Get next element in ids queue
start_id = siq.get_nowait()
except queue.Empty:
# If the ids queue is empty, then terminate
break
else:
# Do a random walk of length <length> from the vertex with id <start_id>
random_walk = g.V(start_id).repeat(
__.local(__.both().sample(1))
).times(length).path().next()
print(f"{proc_id}: rw obtained")
# Transform the list of vertices into a comma-separated string of ids
rwq.put(",".join(
[str(v.id) for v in random_walk]
))
print(f"{proc_id}: rw handled")
if __name__ == "__main__":
# Get the parameters from the <config.ini> configuration file
config = configparser.RawConfigParser()
config.read("config.ini")
jg_uri = config["JANUSGRAPH"]["URI"]
file_random_walks = config["FILES"]["RANDOM_WALKS"]
walks_nb_per_node = int(config["WALKS"]["NB_PER_NODE"])
walks_length = int(config["WALKS"]["LENGTH"])
# Connect to Janus Graph
connection = DriverRemoteConnection(jg_uri, "g")
g_main = traversal().withRemote(connection)
# Instantiate the queues and populate the ids one
start_ids_queue = Queue()
random_walks_queue = Queue()
for vertex in g_main.V().has("vertex_label", "<label>").fold().next():
start_ids_queue.put(vertex.id)
# Create and start the processes
nb_processes = cpu_count()
processes = []
for i in range(nb_processes):
p = Process(target=job, args=(
i,
start_ids_queue,
random_walks_queue,
g_main,
walks_length
))
processes.append(p)
p.start()
for p in processes:
p.join()
# Once the processes are terminated, read the random walks queue
random_walks = []
while not random_walks_queue.empty():
random_walks.append(random_walks_queue.get())
# Do something with the random walks
...
Issue:
Once the processes are started, nothing seems to happen. I never get the X: rw obtained/X: rw handled messages. With a bit more logging, I can see that the queries have been sent yet isn't finishing.
In the logs, when performing the first g_main.V().has("vertex_label", "<label>").fold().next() in the main process (when I populate the ids queue), I have the following message:
DEBUG:gremlinpython:submit with bytecode '[['V'], ['has', 'vertex_label', 'movie'], ['fold']]'
DEBUG:gremlinpython:message '[['V'], ['has', 'vertex_label', '<label>'], ['fold']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V'], ['has', 'vertex_label', '<label>'], ['fold']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
When the other processes send their queries, I have similar logs:
DEBUG:gremlinpython:submit with bytecode '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:message '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
The issue seems not to reside in the query sent, but instead in the indefinite wait that ensues.
If you know of an issue with gremlinpython and multiprocessing, if there is a problem in my multi-processing code, or if you have any explanation that I may have overlooked, please explain to me! Thanks a lot to everyone reading this!
Solutions:
The first partial solution that I found is to use multi-threading instead of multiprocessing:
import configparser
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __
import threading
class myThread(threading.Thread):
def __init__(self, thread_id, g, length):
threading.Thread.__init__(self)
self.thread_id = thread_id
self.thread_count = 0
self.gtraversal = g
self.walk_length = length
self.nb_walks = nb_walks
def run(self):
while True:
start_ids_list_lock.acquire()
try:
start_id = start_ids_list.pop(0)
start_ids_list_lock.release()
except IndexError:
start_ids_list_lock.release()
break
else:
self.thread_count += 1
random_walk = job(
vertex_id=start_id,
g=self.gtraversal,
length=self.walk_length,
nb_walks=self.nb_walks
)
random_walks_list_lock.acquire()
random_walks_list.append(random_walk)
random_walks_list_lock.release()
logging.info(f"Thread {self.thread_id}: {self.thread_count} done")
def job(vertex_id:int, g:AnonymousTraversalSource, length:int) -> str:
random_walk = g.V(vertex_id).repeat(
__.local(__.both().sample(1))
).times(length).path().next()
return ",".join(random_walk)
config = configparser.RawConfigParser()
config.read("config.ini")
jg_uri = config["JANUSGRAPH"]["URI"]
file_random_walks = config["FILES"]["RANDOM_WALKS"]
walks_length = int(config["WALKS"]["LENGTH"])
connection = DriverRemoteConnection(jg_uri, "g")
g_main = traversal().withRemote(connection)
threads = []
start_ids_list = []
random_walks_list = []
random_walks_list_lock = threading.Lock()
start_ids_list_lock = threading.Lock()
start_ids_list = [vertex.id for vertex in g_main.V().has("vertex_label", "<label>").fold().next()]
nb_vertices = len(start_ids_list)
nb_threads = 6
for i in range(nb_threads):
thread = myThread(
thread_id=i,
g=g_main,
length=walks_length
)
thread.start()
threads.append(thread)
for t in threads:
t.join()
# Do something with the random walks
...
This solution is effectively working and improves the execution time of the program. This isn't a full answer though, as it doesn't explain why the multiprocessing is not performing as I expected.
Related
I see a lot of tutorials on how to use queues, but they always show them implemented in the same file. I'm trying to organize my code files well from the beginning because I anticipate the project to become very large. How do I get the queue that I initialize in my main file to import into the other function files?
Here is my main file:
import multiprocessing
import queue
from data_handler import data_handler
from get_info import get_memory_info
from get_info import get_cpu_info
if __name__ == '__main__':
q = queue.Queue()
getDataHandlerProcess = multiprocessing.Process(target=data_handler(q))
getMemoryInfoProcess = multiprocessing.Process(target=get_memory_info(q))
getCPUInfoProcess = multiprocessing.Process(target=get_cpu_info(q))
getDataHandlerProcess.start()
getMemoryInfoProcess.start()
getCPUInfoProcess.start()
print("DEBUG: All tasks successfully started.")
Here is my producer:
import psutil
import struct
import time
from data_frame import build_frame
def get_cpu_info(q):
while True:
cpu_string_data = bytes('', 'utf-8')
cpu_times = psutil.cpu_percent(interval=0.0, percpu=True)
for item in cpu_times:
cpu_string_data = cpu_string_data + struct.pack('<d',item)
cpu_frame = build_frame(cpu_string_data, 0, 0, -1, -1)
q.put(cpu_frame)
print(cpu_frame)
time.sleep(1.000)
def get_memory_info(q):
while True:
memory_string_data = bytes('', 'utf-8')
virtual_memory = psutil.virtual_memory()
swap_memory = psutil.swap_memory()
memory_info = list(virtual_memory+swap_memory)
for item in memory_info:
memory_string_data = memory_string_data + struct.pack('<d',item)
memory_frame = build_frame(memory_string_data, 0, 1, -1, -1)
q.put(memory_frame)
print(memory_frame)
time.sleep(1.000)
def get_disk_info(q):
while True:
disk_usage = psutil.disk_usage("/")
disk_io_counters = psutil.disk_io_counters()
time.sleep(1.000)
print(disk_usage)
print(disk_io_counters)
def get_network_info(q):
while True:
net_io_counters = psutil.net_io_counters()
time.sleep(1.000)
print(net_io_counters)
And here is my consumer:
def data_handler(q):
while True:
next_element = q.get()
print(next_element)
print('Item received at data handler queue.')
It is not entirely clear to me what do you mean by " How do I get the queue that I initialize in my main file to import into the other function files?".
Normally you pass a queue as and argument to a function and use it within a function scope regardless of the file structure. Or perform any other variable sharing techniques used for any other data type.
Your code seems to have a few errors however. Firstly, you shouldn't be using queue.Queue with multiprocessing. It has it's own version of that class.
q = multiprocessing.Queue()
It is slower than the queue.Queue, but it works for sharing the data across processes.
Secondly, the proper way to create process objects is:
getDataHandlerProcess = multiprocessing.Process(target=data_handler, args = (q,))
Otherwise you are actually calling data_handler(q) the main thread and trying to assign its return value to the target argument of multiprocessing.Process. Your data_handler function never returns, so the program probably gets into an infinite a deadlock at this point before multiprocessing even begins. Edit: actually it probably goes into infinite wait trying to get an element from an empty queue which will never be filled.
I'm using ProcessPoolExecutor context manager to run several Kafka consumers in parallel. I need to store the process IDs of the child processes so that later, I can cleanly terminate those processes. I have such code:
Class MultiProcessConsumer:
...
def run_in_parallel(self):
parallelism_factor = 5
with ProcessPoolExecutor() as executor:
processes = [executor.submit(self.consume) for _ in range(parallelism_factor)]
# It would be nice If I could write [process.pid for process in processes] to a file here.
def consume(self):
while True:
for message in self.kafka_consumer:
do_stuff(message)
I know I can use os.get_pid() in the consume method to get PIDs. But, handling them properly (in case of constant shutting down or starting up of consumers) requires some extra work.
How would you propose that I get and store PIDs of the child processes in such a context?
os.get_pid() seems to be the way to go. Just pass them through a Queue or Pipe in combination with maybe some random UUID that you pass to the process before to identify the PID.
from concurrent.futures import ProcessPoolExecutor
import os
import time
import uuid
#from multiprocessing import Process, Queue
import multiprocessing
import queue
#The Empty exception in in Queue, multiprocessing borrows
#it from there
# https://stackoverflow.com/questions/9908781/sharing-a-result-queue-among-several-processes
m = multiprocessing.Manager()
q = m.Queue()
def task(n, queue, uuid):
my_pid = os.getpid()
print("Executing our Task on Process {}".format(my_pid))
queue.put((uuid, my_pid))
time.sleep(n)
return n * n
def main():
with ProcessPoolExecutor(max_workers = 3) as executor:
some_dict = {}
for i in range(10):
print(i)
u = uuid.uuid4()
f = executor.submit(task, i, q, u)
some_dict[u] = [f, None] # PID not known here
try:
rcv_uuid, rcv_pid = q.get(block=True, timeout=1)
some_dict[rcv_uuid][1] = rcv_pid # store PID
except queue.Empty as e:
print('handle me', e)
print('I am', rcv_uuid, 'and my PID is', rcv_pid)
if __name__ == '__main__':
main()
Although this field is private, you could use the field in PoolProcessExecutor self._processes. The code snippet below shows how to use this variable.
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait
nb_processes = 100
executor = ProcessPoolExecutor(nb_processes )
futures = [executor.submit(os.getpid) for _ in range(nb_processes )]
wait(futures)
backends = list(map(lambda x: x.result(), futures))
assert len(set(backends)) == nb_processes
In the case above, an assertion error is raised. This is because a new task can reuse the forked processes in the pool. You cannot know all forked process IDs through the method you memtioned. Hence, you can do as:
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait
nb_processes = 100
executor = ProcessPoolExecutor(nb_processes )
futures = [executor.submit(os.getpid) for _ in range(nb_processes )]
wait(futures)
backends = list(map(lambda x: x.result(), futures))
assert len(set(executor._processes.keys())) == nb_processes
print('all of PID are: %s.' % list(executor._processes.keys()))
If you don't want to destroy the encapsulation, you could inhert the ProcessPoolExecutor and create a new property for that.
I am having a hard time working with Python multiprocessing module.
In a nutshell, I have a dictionary object which updates, say, occurrences of a string from lots of s3 files. The key for the dictionary is the occurrence I need which increments by 1 each time it is found.
Sample code:
import boto3
from multiprocessing import Process, Manager
import simplejson
client = boto3.client('s3')
occurences_to_find = ["occ1", "occ2", "occ3"]
list_contents = []
def getS3Key(prefix_name, occurence_dict):
kwargs = {'Bucket': "bucket_name", 'Prefix': "prefix_name"}
while True:
value = client.list_objects_v2(**kwargs)
try:
contents = value['Contents']
for obj in contents:
key=obj['Key']
yield key
try:
kwargs['ContinuationToken'] = value['NextContinuationToken']
except KeyError:
break
except KeyError:
break
def getS3Object(s3_key, occurence_dict):
object = client.get_object(Bucket=bucket_name, Key=s3_key)
objjects = object['Body'].read()
for object in objects:
object_json = simplejson.loads(activity)
msg = activity_json["msg"]
for occrence in occurence_dict:
if occrence in msg:
occurence_dict[str(occrence)] += 1
break
'''each process will hit this function'''
def doWork(prefix_name_list, occurence_dict):
for prefix_name in prefix_name_list:
for s3_key in getS3Key(prefix_name, occurence_dict):
getS3Object(s3_key, occurence_dict)
def main():
manager = Manager()
'''shared dictionary between processes'''
occurence_dict = manager.dict()
procs = []
s3_prefixes = [["prefix1"], ["prefix2"], ["prefix3"], ["prefix4"]]
for occurrence in occurences_to_find:
occurence_dict[occurrence] = 0
for index,prefix_name_list in enumerate(s3_prefixes):
proc = Process(target=doWork, args=(prefix_name_list, occurence_dict))
procs.append(proc)
for proc in procs:
proc.start()
for proc in procs:
proc.join()
print(occurence_dict)
main()
I am having issues with speed of the code as it takes hours for the code to run with more than 10000 s3 prefixes and keys. I think the manager dictionary is shared and is locked by each process, and hence, it is not being updated concurrently; rather one process waits for it to be "released".
How can I update the dictionary parallely? Or, how can I maintain multiple dicts for each process and then combine the result in the end?
I use a script to parce some sites and get news from there.
Each function in this script parse one site and return list of articles and then I want to combine them all in one big list.
If I parce site by site it takes to long and I desided to use multithreading.
I found a sample like this one in the bottom, but it seems not pithonic for me.
If I will add one more function to parse one more site, I will need to add each time the same block of code:
qN = Queue()
Thread(target=wrapper, args=(last_news_from_bar, qN)).start()
news_from_N = qN.get()
for new in news_from_N:
news.append(new)
Is there another solution to do this kind of stuff?
#!/usr/bin/python
# -*- coding: utf-8 -*-
from queue import Queue
from threading import Thread
def wrapper(func, queue):
queue.put(func())
def last_news_from_bar():
...
return list_of_articles #[['title1', 'http://someurl1', '2017-09-13'],['title2', 'http://someurl2', '2017-09-13']]
def last_news_from_foo():
...
return list_of_articles
q1, q2 = Queue(), Queue()
Thread(target=wrapper, args=(last_news_from_bar, q1)).start()
Thread(target=wrapper, args=(last_news_from_foo, q2)).start()
news_from_bar = q1.get()
news_from_foo = q2.get()
all_news = []
for new in news_from_bar:
news.append(new)
for new in news_from_foo:
news.append(new)
print(all_news)
Solution without Queue:
NEWS = []
LOCK = Lock()
def gather_news(url):
while True:
news = news_from(url)
if not news: break
with LOCK:
NEWS.append(news)
if __name__ == '__main__':
T = []
for url in ['url1', 'url2', 'url3']:
t = Thread(target=gather_news, args=(url,))
t.start()
T.append(t)
# Wait until all Threads done
for t in T:
t.join()
print(NEWS)
All, that you should do, is using a single queue and extend your result array:
q1 = Queue()
Thread(target=wrapper, args=(last_news_from_bar, q1)).start()
Thread(target=wrapper, args=(last_news_from_foo, q1)).start()
all_news = []
all_news.extend(q1.get())
all_news.extend(q1.get())
print(all_news)
After some lookup into google and posts of stackoverflow and other sites, i'm still confused on how i can apply a queue and threading on my code:
import psycopg2
import sys
import re
# for threading and queue
import multiprocessing
from multiprocessing import Queue
# for threading and queue
import time
from datetime import datetime
class Database_connection():
def db_call(self,query,dbHost,dbName,dbUser,dbPass):
try:
con = None
con = psycopg2.connect(host=dbHost,database=dbName,
user=dbUser,password=dbPass)
cur = con.cursor()
cur.execute(query)
data = cur.fetchall()
resultList = []
for data_out in data:
resultList.append(data_out)
return resultList
except psycopg2.DatabaseError, e:
print 'Error %s' % e
sys.exit(1)
finally:
if con:
con.close()
w = Database_connection()
sql = "select stars from galaxy"
startTime = datetime.now()
for result in w.db_call(sql, "x", "x", "x", "x"):
print result[0]
print "Runtime: " + str(datetime.now()-startTime)
lets supose the result will be 100+ values. How can i, put those 100+ results on queue and execute ( print, for example ) then 5 at time using queue and multiprocessing module?
What do you want this code to do?
You get no output from this code because get() returns the next item from the queue (doc). You are putting the letters from the sql response into the queue one letter at a time. the i in for i... is looping over the list returned by w.db_call. Those items are (I assume) strings, which you are then iterating over and adding one at a time to the queue. The next thing you do is to remove the element you just added to the queue from the queue, which leaves the queue unchanged over each pass through the loop. If you put a print statement in the loop it prints out the letter it just got from the queue.
Queues are used to pass information between processes. I think you are trying to set-up a producer/consumer pattern where you have one process add things to the queue and multiple other processes which consume things from the queue. See working example of multiprocessing.Queue and links contained there in (example, main documentation).
Probably the simplest way to get this working, as long as you don't need it to run in an interactive shell, is to use Pool (lifted almost verbatim from the documentation of multiprocess)
from multiprocessing import Pool
p = Pool(5) # sets the number of worker threads you want
def f(res):
# put what ever you want to do with each of the query results in here
return res
result_lst = w.db_call(sql, "x", "x", "x", "x")
proced_results = p.map(f, result_lst)
which apply what ever you want to do to each result (written into the function f) and returns the results of that manipulation as a list. The number of sub-processes to use is set by the argument to Pool.
This is my suggestion...
import Queue
from threading import Thread
class Database_connection:
def db_call(self,query,dbHost,dbName,dbUser,dbPass):
# your code here
return
# in this example each thread will execute this function
def processFtpAddrMt(queue):
# loop will continue until queue containing FTP addresses is empty
while True:
# get an ftp address, a exception will be called when the
# queue is empty and the loop will break
try: ftp_addr = queue.get()
except: break
# put code to process the ftp address here
# let queue know this task is done
queue.task_done()
w = Database_connection()
sql = "select stars from galaxy"
ftp_addresses = w.db_call(sql, "x", "x", "x", "x")
# put each result of the SQL call in a Queue class
ftp_addr_queue = Queue.Queue()
for addr in ftp_addresses:
ftp_addr_queue.put(addr)
# create five threads where each one will run analyzeFtpResult
# pass the queue to the analyzeFtpResult function
for x in range(0,5):
t = Thread(target=processFtpAddrMt,args=(ftp_addr_queue,))
t.setDaemon(True)
t.start()
# blocks further execution of the script until all queue items have been processed
ftp_addr_queue.join()
It uses the Queue class to store your SQL results and then the Thread class to process the queue. Five thread classes are created and each one uses a processFtpAddrMt function which take ftp addresses from the queue until the queue is empty. All you have to do is add the code for processing the ftp address. Hope this helps.
I was able to solve the problem with the following:
def worker():
w = Database_connection()
sql = "select stars from galaxy"
for result in w.db_call(sql, "x", "x", "x", "x"):
if result:
jobs = []
startTime = datetime.now()
for i in range(1):
p = multiprocessing.Process(target=worker)
jobs.append(p)
p.start()
print "Runtime: " + str(datetime.now()-startTime)
I belive it is not the best way to do it, but for now solved my problem :)