My app iterates through chunks of hosts, fetches some data for those hosts, processes it, and updates a couple of defaultdicts.
I am trying to update the defaultdicts from an inner function, using threading and a Lock.
Here is an example of what I am doing:
from threading import Lock
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict

THREAD_LOCK = Lock()

def collect_data(self):
    data_1 = defaultdict(set)
    data_2 = defaultdict(list)

    def update_data(id_to_ips, risks):
        for id, ips in id_to_ips.items():
            data_1[id].update(ips)
        for ip, risk in risks.items():
            data_2[risk].append(ip)

    if self.thread_count:
        all_hosts = self._get_all_host()
        with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
            func = lambda x: list(self._get_data(x))
            for results in executor.map(func, all_hosts):
                for id_to_ips_dict, risks_dict in results:
                    with THREAD_LOCK:
                        update_data(id_to_ips_dict, risks_dict)
    else:
        for host_ids in self._get_all_host_ids():
            id_to_ips_dict, risks_dict = self._get_data(host_ids)
            update_data(id_to_ips_dict, risks_dict)

    return data_1, data_2
I know that updating a defaultdict while holding THREAD_LOCK is okay, but updating it from the inner function confuses me a bit.
What do you think?
Any ideas or suggestions?
Thanks.
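For reference, here is a minimal standalone sketch (a toy example of my own, not taken from the code above) showing that an inner function mutating defaultdicts from the enclosing scope behaves the same as mutating them directly, as long as every mutation happens while the lock is held:

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from threading import Lock

LOCK = Lock()

def collect(items):
    data = defaultdict(set)

    def update(key, values):
        # The closure reads `data` from the enclosing scope; nothing is
        # rebound, so no `nonlocal` is needed and the same dict is shared.
        data[key].update(values)

    def worker(item):
        key, values = item
        with LOCK:  # serialize every mutation
            update(key, values)

    with ThreadPoolExecutor(max_workers=4) as executor:
        list(executor.map(worker, items))
    return data

if __name__ == "__main__":
    print(collect([("a", {1, 2}), ("b", {3}), ("a", {4})]))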
I want to define functions whose names come from a dataframe and run them all at the same time.
I tried the following, but the functions are not starting:
import pandas as pd
import threading
import time
import inspect

df = pd.read_csv('../cities-list.csv')

for i in df["cities"]:
    define_func = f"""
    def task_{i}():
        for i in range(0,5):
            time.sleep(i)
            print(i)
    """
    define_func = inspect.cleandoc(define_func)
    exec(define_func)

    Thread = threading.Thread(target=exec(define_func))
    Thread.start()
    Thread.join()
Looks like you want to process each city in a separate thread. Note that target=exec(define_func) calls exec immediately and passes its return value, None, as the thread target, so nothing ever runs in the thread. In that case, this is simpler:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

df = pd.read_csv('../cities-list.csv')

def process(city):
    print(f'Processing {city}')

with ThreadPoolExecutor() as executor:
    executor.map(process, df['cities'])
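If each city involves real work rather than a print, the same structure lets you collect per-city results; here is a small sketch (the sleep-based process function and the cities list are stand-ins of mine, not from the original code):

import time
from concurrent.futures import ThreadPoolExecutor

cities = ['Paris', 'Lima', 'Osaka']  # stand-in for df['cities']

def process(city):
    time.sleep(1)  # placeholder for the per-city work
    return f'{city} done'

with ThreadPoolExecutor() as executor:
    results = list(executor.map(process, cities))

print(results)  # results come back in the same order as the input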
Goal:
Accelerate the random walk generation by using multiple processes.
Put the list of vertex ids from which I want random walks to be generated into an input queue
Start as many processes as possible with the correct parameters
Make them put the random walks into an output queue
Wait for completion
Read the output queue
What I am doing:
# Libraries imports
from multiprocessing import cpu_count, Process, Queue
import queue
import configparser
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __

# Function the processes are supposed to execute
def job(proc_id:int, siq:Queue, rwq:Queue, g:AnonymousTraversalSource, length:int):
    while True:
        try:
            # Get next element in ids queue
            start_id = siq.get_nowait()
        except queue.Empty:
            # If the ids queue is empty, then terminate
            break
        else:
            # Do a random walk of length <length> from the vertex with id <start_id>
            random_walk = g.V(start_id).repeat(
                __.local(__.both().sample(1))
            ).times(length).path().next()
            print(f"{proc_id}: rw obtained")
            # Transform the list of vertices into a comma-separated string of ids
            rwq.put(",".join(
                [str(v.id) for v in random_walk]
            ))
            print(f"{proc_id}: rw handled")
if __name__ == "__main__":
    # Get the parameters from the <config.ini> configuration file
    config = configparser.RawConfigParser()
    config.read("config.ini")

    jg_uri = config["JANUSGRAPH"]["URI"]
    file_random_walks = config["FILES"]["RANDOM_WALKS"]
    walks_nb_per_node = int(config["WALKS"]["NB_PER_NODE"])
    walks_length = int(config["WALKS"]["LENGTH"])

    # Connect to Janus Graph
    connection = DriverRemoteConnection(jg_uri, "g")
    g_main = traversal().withRemote(connection)

    # Instantiate the queues and populate the ids one
    start_ids_queue = Queue()
    random_walks_queue = Queue()
    for vertex in g_main.V().has("vertex_label", "<label>").fold().next():
        start_ids_queue.put(vertex.id)

    # Create and start the processes
    nb_processes = cpu_count()
    processes = []
    for i in range(nb_processes):
        p = Process(target=job, args=(
            i,
            start_ids_queue,
            random_walks_queue,
            g_main,
            walks_length
        ))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

    # Once the processes are terminated, read the random walks queue
    random_walks = []
    while not random_walks_queue.empty():
        random_walks.append(random_walks_queue.get())

    # Do something with the random walks
    ...
Issue:
Once the processes are started, nothing seems to happen. I never get the X: rw obtained / X: rw handled messages. With a bit more logging, I can see that the queries are being sent but never finish.
In the logs, when performing the first g_main.V().has("vertex_label", "<label>").fold().next() in the main process (when I populate the ids queue), I have the following messages:
DEBUG:gremlinpython:submit with bytecode '[['V'], ['has', 'vertex_label', 'movie'], ['fold']]'
DEBUG:gremlinpython:message '[['V'], ['has', 'vertex_label', '<label>'], ['fold']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V'], ['has', 'vertex_label', '<label>'], ['fold']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
When the other processes send their queries, I have similar logs:
DEBUG:gremlinpython:submit with bytecode '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:message '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
The issue does not seem to lie in the query that is sent, but in the indefinite wait that follows.
If you know of an issue with gremlinpython and multiprocessing, if there is a problem in my multiprocessing code, or if you have any explanation that I may have overlooked, please let me know! Thanks a lot to everyone reading this!
Solutions:
The first partial solution that I found is to use multi-threading instead of multiprocessing:
import configparser
import logging
import threading

from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __


class myThread(threading.Thread):
    def __init__(self, thread_id, g, length):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.thread_count = 0
        self.gtraversal = g
        self.walk_length = length

    def run(self):
        while True:
            start_ids_list_lock.acquire()
            try:
                start_id = start_ids_list.pop(0)
                start_ids_list_lock.release()
            except IndexError:
                start_ids_list_lock.release()
                break
            else:
                self.thread_count += 1
                random_walk = job(
                    vertex_id=start_id,
                    g=self.gtraversal,
                    length=self.walk_length
                )
                random_walks_list_lock.acquire()
                random_walks_list.append(random_walk)
                random_walks_list_lock.release()
                logging.info(f"Thread {self.thread_id}: {self.thread_count} done")


def job(vertex_id:int, g:AnonymousTraversalSource, length:int) -> str:
    random_walk = g.V(vertex_id).repeat(
        __.local(__.both().sample(1))
    ).times(length).path().next()
    # Same transformation as in the multiprocessing version: vertex ids joined into a string
    return ",".join(str(v.id) for v in random_walk)


config = configparser.RawConfigParser()
config.read("config.ini")

jg_uri = config["JANUSGRAPH"]["URI"]
file_random_walks = config["FILES"]["RANDOM_WALKS"]
walks_length = int(config["WALKS"]["LENGTH"])

connection = DriverRemoteConnection(jg_uri, "g")
g_main = traversal().withRemote(connection)

threads = []
start_ids_list = []
random_walks_list = []
random_walks_list_lock = threading.Lock()
start_ids_list_lock = threading.Lock()

start_ids_list = [vertex.id for vertex in g_main.V().has("vertex_label", "<label>").fold().next()]
nb_vertices = len(start_ids_list)

nb_threads = 6
for i in range(nb_threads):
    thread = myThread(
        thread_id=i,
        g=g_main,
        length=walks_length
    )
    thread.start()
    threads.append(thread)

for t in threads:
    t.join()

# Do something with the random walks
...
This solution is effectively working and improves the execution time of the program. It isn't a full answer though, as it doesn't explain why multiprocessing does not perform as I expected.
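One direction that may be worth exploring (an assumption on my part, not something verified against this JanusGraph setup): the DriverRemoteConnection and the traversal source wrap a websocket and an asyncio event loop, and objects like that generally do not survive being inherited by a forked child process. A minimal sketch of how each worker process could open its own connection instead of reusing g_main, keeping the rest of the walk logic from the code above:

# Hedged sketch: each process builds its own remote connection instead of
# sharing the parent's g_main. jg_uri and the walk logic are assumed to be
# the same as in the multiprocessing code above.
from multiprocessing import Process, Queue
import queue
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __

def job(proc_id, siq, rwq, jg_uri, length):
    connection = DriverRemoteConnection(jg_uri, "g")  # per-process connection
    g = traversal().withRemote(connection)
    try:
        while True:
            try:
                start_id = siq.get_nowait()
            except queue.Empty:
                break
            random_walk = g.V(start_id).repeat(
                __.local(__.both().sample(1))
            ).times(length).path().next()
            rwq.put(",".join(str(v.id) for v in random_walk))
    finally:
        connection.close()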
In multiprocessing, I wanted to update a manager.dict(). It is being updated, but some data gets skipped along the way. What can be done?
It's something similar to this...
from multiprocessing import Process, Manager

manager = Manager()
a = manager.dict()
a['url_info'] = manager.list()

def parse_link(link):
    # parse link; link_parser returns a dict
    pared_info = link_parser(link)
    a['url_info'].append(pared_info)

# links contains a lot of urls that need to be parsed
links = ["https://url.com/1", "https://url.com/2", "https://url.com/3"]

processes = []
for link in links:
    p = Process(target=parse_link, args=(link,))
    p.start()
    processes.append(p)

for process in processes:
    process.join()
link_parser() is a function that returns a dictionary, which contains the information about the scraped/parsed webpage.
> print(list(a['url_info']))
> ['#info_1', '#info_3']
Here the multiprocessing program skipped adding #info_2 to the list (aka array). Help me, please.
Here's some code that demonstrates an improved structure for what you're trying to do.
Obviously it doesn't have the detail of your link_parser(), but you'll get the point.
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager
from functools import partial

LINKS = ['abc', 'def', 'ghi']
KEY = 'url_info'

def parse_link(a, link):
    a[KEY].append(link)

def main():
    with Manager() as manager:
        a = manager.dict()
        a[KEY] = manager.list()
        with ProcessPoolExecutor() as executor:
            executor.map(partial(parse_link, a), LINKS)
        print(a[KEY])

if __name__ == '__main__':
    main()
Output:
['abc', 'def', 'ghi']
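A variation on the same structure (my own sketch, not part of the answer above): if each worker can simply return its result, you can skip the managed list entirely and let executor.map collect the results in input order:

from concurrent.futures import ProcessPoolExecutor

LINKS = ['abc', 'def', 'ghi']

def parse_link(link):
    # stand-in for link_parser(); return whatever the real parser produces
    return f'parsed:{link}'

def main():
    with ProcessPoolExecutor() as executor:
        url_info = list(executor.map(parse_link, LINKS))
    print(url_info)  # results arrive in the same order as LINKS

if __name__ == '__main__':
    main()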
I'm trying to add multithreading to a very time-consuming program, and I've come across this SO answer:
https://stackoverflow.com/a/28463266/3451339, which basically offers this solution for multiple arrays:
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(4)
results = pool.map(my_function, my_array)
# Close the pool and wait for the work to finish
pool.close()
pool.join()
and, passing multiple arrays:
results = pool.starmap(function, zip(list_a, list_b))
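(For completeness, a minimal runnable version of that starmap pattern, using a toy add function of my own rather than anything from the linked answer, looks like this:

from multiprocessing.dummy import Pool as ThreadPool

def add(a, b):
    return a + b

list_a = [1, 2, 3]
list_b = [10, 20, 30]

with ThreadPool(4) as pool:
    # starmap unpacks each (a, b) tuple into the function's arguments
    results = pool.starmap(add, zip(list_a, list_b))

print(results)  # [11, 22, 33]
)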
The following is the code I have so far, which must be refactored with threading. It iterates over four lists, passes arguments to the function at each iteration, and appends all results to a final container:
strategies = ['strategy_1', 'strategy_2']
budgets = [90, 100, 110, 120, 130, 140, 150, 160]
formations = ['343', '352', '433', '442', '451', '532', '541']
models = ['model_1', 'model_2', 'model_3']

all_teams = pd.DataFrame()
for strategy in strategies:
    for budget in budgets:
        for formation in formations:
            for model in models:
                team = function(strategy=strategy,
                                budget=budget,
                                curr_formation=formation,
                                model=model)
                all_teams = all_teams.append(team, ignore_index=True, sort=False)\
                                     .reset_index(drop=True)\
                                     .copy()
Note: Each function call makes API web requests.
What is the way to go with multithreading in this scenario?
Python has the multiprocessing module, which can run multiple tasks in parallel, and inside each process you can have multiple threads or async IO code.
Here is a working example which uses 3 processes and multithreading:
import pandas as pd
import multiprocessing
from multiprocessing import Queue
from threading import Thread

strategies = ['strategy_1', 'strategy_2']
budgets = [90, 100, 110, 120, 130, 140, 150, 160]
formations = ['343', '352', '433', '442', '451', '532', '541']
models = ['model_1', 'model_2', 'model_3']

# shared Queue; if you want to reduce write locking, use 3 Queues
Q = Queue()

# Retrieve async if you want to speed up the process
def function(q, strategy, budget, curr_formation, model):
    q.put("Team")

def runTask(model, q):
    for strategy in strategies:
        for budget in budgets:
            for formation in formations:
                Thread(target=function, args=(q, strategy, budget, formation, model)).start()

def main():
    p1 = multiprocessing.Process(target=runTask, args=('model_1', Q))
    p2 = multiprocessing.Process(target=runTask, args=('model_2', Q))
    p3 = multiprocessing.Process(target=runTask, args=('model_3', Q))
    p1.start()
    p2.start()
    p3.start()
    p1.join()
    p2.join()
    p3.join()
    all = []
    for i in range(0, Q.qsize()):
        all.append(Q.get())
    print(all)
    print(len(all))

if __name__ == "__main__":
    main()
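One caveat worth noting (my own remark, not part of the answer above): Queue.qsize() is not implemented on every platform (it raises NotImplementedError on macOS), so a more portable way to drain the queue after the joins is a get_nowait() loop; the result list name here is mine, chosen to avoid shadowing the built-in all:

import queue

collected = []
while True:
    try:
        collected.append(Q.get_nowait())
    except queue.Empty:
        break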
A useful article: Multiprocessing in Python | Set 2
This can be one approach.
A note on threads vs. processes: in the SO answer linked above, execution goes through map, but that will not work here because of map's limitation on how the parameters can be passed, so this answer submits each parameter combination individually.
Run your nested for loops and build a list of parameters ==> financial_options
for strategy in strategies:
    for budget in budgets:
        for formation in formations:
            for model in models:
                financial_options.append([strategy, budget, formation, model])
financial_options_len = len(financial_options)
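Equivalently (a small side note of my own, not from the original answer), itertools.product builds the same list of combinations without the nested loops:

from itertools import product

financial_options = [list(combo) for combo in product(strategies, budgets, formations, models)]
financial_options_len = len(financial_options)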
Create a new function that will handle the API calls:
def access_url(url, parameter_list):
    #response=requests.get(url) # request goes here
    print(parameter_list)
    time.sleep(2)
    print("sleep done!")
    return "Hello"  #, parameter_list # return type
Now run the threading with these permutation parameters. The complete program will look like this:
import concurrent.futures
import requests # just in case needed
from bs4 import BeautifulSoup # just in case needed
import time
import pandas as pd

def access_url(url, parameter_list):
    #response=requests.get(url) # request goes here
    print(parameter_list)
    time.sleep(2)
    print("sleep done!")
    return "Hello"  #, parameter_list # return type

def multi_threading():
    test_url = "http://bla bla.com/"
    base_url = test_url
    THREAD_MULTI_PROCESSING = True
    strategies = ['strategy_1', 'strategy_2']
    budgets = [90, 100, 110, 120, 130, 140, 150, 160]
    formations = ['343', '352', '433', '442', '451', '532', '541']
    models = ['model_1', 'model_2', 'model_3']
    all_teams = pd.DataFrame()
    start = time.perf_counter() # start time for performance
    financial_options = []
    decision_results = []
    for strategy in strategies:
        for budget in budgets:
            for formation in formations:
                for model in models:
                    financial_options.append([strategy, budget, formation, model])
    financial_options_len = len(financial_options)
    print(f"Total options:{financial_options_len}")
    future_list = []
    THREAD_MULTI_PROCESSING_LOOP = True
    if THREAD_MULTI_PROCESSING_LOOP:
        with concurrent.futures.ThreadPoolExecutor() as executor: # through executor
            for each in range(financial_options_len):
                future = executor.submit(access_url, test_url, financial_options[each]) # submit each option
                future_list.append(future)
            for f1 in concurrent.futures.as_completed(future_list):
                r1 = f1.result()
                decision_results.append(r1)
    end = time.perf_counter() # finish time for performance
    print(f'Threads: Finished in {round(end - start, 2)} second(s)')
    df = pd.DataFrame(decision_results)
    df.to_csv("multithread_for.csv")
    return df, decision_results

df, results = multi_threading()
I'm using ProcessPoolExecutor context manager to run several Kafka consumers in parallel. I need to store the process IDs of the child processes so that later, I can cleanly terminate those processes. I have such code:
class MultiProcessConsumer:
    ...

    def run_in_parallel(self):
        parallelism_factor = 5
        with ProcessPoolExecutor() as executor:
            processes = [executor.submit(self.consume) for _ in range(parallelism_factor)]
            # It would be nice if I could write [process.pid for process in processes] to a file here.

    def consume(self):
        while True:
            for message in self.kafka_consumer:
                do_stuff(message)
I know I can use os.getpid() in the consume method to get the PIDs. But handling them properly (in case of constantly shutting down or starting up consumers) requires some extra work.
How would you propose that I get and store the PIDs of the child processes in such a context?
os.getpid() seems to be the way to go. Just pass the PIDs back through a Queue or Pipe, possibly together with a random UUID that you pass to the process beforehand so you can identify which PID belongs to which task.
from concurrent.futures import ProcessPoolExecutor
import os
import time
import uuid
#from multiprocessing import Process, Queue
import multiprocessing
import queue

# The Empty exception is in queue; multiprocessing borrows it from there
# https://stackoverflow.com/questions/9908781/sharing-a-result-queue-among-several-processes
m = multiprocessing.Manager()
q = m.Queue()

def task(n, queue, uuid):
    my_pid = os.getpid()
    print("Executing our Task on Process {}".format(my_pid))
    queue.put((uuid, my_pid))
    time.sleep(n)
    return n * n

def main():
    with ProcessPoolExecutor(max_workers=3) as executor:
        some_dict = {}
        for i in range(10):
            print(i)
            u = uuid.uuid4()
            f = executor.submit(task, i, q, u)
            some_dict[u] = [f, None]  # PID not known here
            try:
                rcv_uuid, rcv_pid = q.get(block=True, timeout=1)
                some_dict[rcv_uuid][1] = rcv_pid  # store PID
            except queue.Empty as e:
                print('handle me', e)
            print('I am', rcv_uuid, 'and my PID is', rcv_pid)

if __name__ == '__main__':
    main()
Although this field is private, you could use the field self._processes of ProcessPoolExecutor. The code snippet below shows how to use this variable.
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait

nb_processes = 100
executor = ProcessPoolExecutor(nb_processes)
futures = [executor.submit(os.getpid) for _ in range(nb_processes)]
wait(futures)

backends = list(map(lambda x: x.result(), futures))
assert len(set(backends)) == nb_processes
In the case above, an assertion error is raised. This is because a new task can reuse a forked process already in the pool, so you cannot discover all of the forked process IDs through the method you mentioned. Hence, you can do this instead:
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait

nb_processes = 100
executor = ProcessPoolExecutor(nb_processes)
futures = [executor.submit(os.getpid) for _ in range(nb_processes)]
wait(futures)

backends = list(map(lambda x: x.result(), futures))
assert len(set(executor._processes.keys())) == nb_processes
print('all of the PIDs are: %s.' % list(executor._processes.keys()))
If you don't want to break encapsulation, you could inherit from ProcessPoolExecutor and expose the PIDs through a new property.
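A minimal sketch of that idea (my own, not from the original answer); note that _processes is an internal CPython detail mapping pid to process, so it may change between versions:

import os
from concurrent.futures import ProcessPoolExecutor, wait

class PidAwareProcessPoolExecutor(ProcessPoolExecutor):
    @property
    def worker_pids(self):
        # _processes maps pid -> multiprocessing.Process for the spawned workers
        return list(self._processes.keys()) if self._processes else []

if __name__ == "__main__":
    with PidAwareProcessPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(os.getpid) for _ in range(8)]
        wait(futures)
        print(executor.worker_pids)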