For all the active campaigns, I have to query a TSDB API over a date period to fetch data for each campaign ID, so I get all the campaign IDs from the DB and put them on a queue. In the DB, I have 430 active campaign IDs.
But the Python code terminates after some 100 entries and I don't know the reason; can somebody guide me here? If I remove the API-query code and just print the value returned by q.get(), the ID values to fetch from the API do come through.
Below is the code:
import mysql.connector
from datetime import datetime,timedelta
from datetime import date
import requests
import json
from collections import OrderedDict
from multiprocessing import Pool, Queue
from os import getpid
from time import sleep
from random import random
db = mysql.connector.connect(
    host='HOSTNAME',
    database='DB',
    user='ROOT',
    password='PASSWORD',
    port='PORT'
)
print("Connection ID:", db.connection_id)
MAX_WORKERS=10
class Testing_mp(object):
    def __init__(self):
        """
        Initiates a queue, a pool and a temporary buffer, used only
        when the queue is full.
        """
        self.q = Queue()
        self.pool = Pool(processes=MAX_WORKERS, initializer=self.worker_main,)
        self.temp_buffer = []

    def add_to_queue(self, msg):
        """
        If the queue is full, put the message in a temporary buffer.
        If the queue is not full, add the message to the queue.
        If the buffer is not empty and the queue is not full,
        put messages from the buffer back onto the queue.
        """
        if self.q.full():
            print("QISFULL", msg)
            self.temp_buffer.append(msg)
        else:
            self.q.put(msg)
            if len(self.temp_buffer) > 0:
                add_to_queue(self.temp_buffer.pop())

    def write_to_queue(self):
        """
        This function writes some messages to the queue.
        """
        mycursor = db.cursor()
        mycursor.execute("select Id from Campaign where Status='ACTIVE' order by Id desc")
        myresult = mycursor.fetchall()
        for x in myresult:
            self.add_to_queue(x[3])
            sleep(random()*2)
        db.close()  # close the connection

    def worker_main(self):
        """
        Waits indefinitely for an item to be written to the queue.
        Finishes when the parent process terminates.
        """
        print "Process {0} started".format(getpid())
        while True:
            # If the queue is not empty, pop the next element and do the work.
            # If the queue is empty, wait indefinitely until an element gets into the queue.
            item = self.q.get(block=True, timeout=None)
            start_date = datetime.today()
            start_date = start_date.date()
            end_date = start_date - timedelta(days=8)
            start_date = start_date - timedelta(days=1)
            print "{0} retrieved: {1}".format(getpid(), item)
            #print("STARTDATE", type(start_date))
            start_date_ft = start_date.strftime('%Y/%m/%d')
            ##print("ENDDATE", end_date)
            end_date_ft = end_date.strftime('%Y/%m/%d')
            url = "http://tsdb.metrics.com:4343/api/query"
            if item is not None:
                querystring = {"start": end_date_ft, "end": start_date_ft, "m": "avg:1d-avg:percentization{campaign="+str(item)+",type=seen}"}
                print(querystring)
                response = requests.request("GET", url, params=querystring)
                print(response.text)
                if response and response.text is not None:
                    loaded_json = json.loads(response.text, object_pairs_hook=OrderedDict)
                    for x in loaded_json:
                        for attribute, value in x.items():
                            if attribute is not None and attribute == "dps":
                                dps_data = loaded_json[0][attribute]
                                perValue = []
                                if len(dps_data) > 0:
                                    for key, val in dps_data.items():
                                        perValue.append(str(val))
                                        print(str(item)+"==ITEM=="+key+"="+str(val))
                                    print(perValue)
            # simulate some random length operations
            sleep(random()*1)

# Warning from the Python documentation:
# Functionality within this package requires that the __main__ module be
# importable by the children. This means that some examples, such as the
# multiprocessing.Pool examples, will not work in the interactive interpreter.
if __name__ == '__main__':
    mp_class = Testing_mp()
    mp_class.write_to_queue()
    # Wait a bit for the child processes to do some work,
    # because when the parent exits, the children are terminated.
    sleep(5)
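For context on the shutdown at the end of that script: the final comment itself notes that the children are terminated when the parent exits, and the parent only waits five seconds, so workers can be killed partway through the queue. Below is a minimal, self-contained sketch (my own illustration, assuming the fork start method as on Linux, not the poster's confirmed fix) of a sentinel-based shutdown where the parent joins the pool instead of sleeping:

from multiprocessing import Pool, Queue
from os import getpid

MAX_WORKERS = 10
q = Queue()  # inherited by the workers via fork

def worker_main():
    while True:
        item = q.get()            # block until an item arrives
        if item is None:          # sentinel: no more work, exit cleanly
            break
        print("{0} processing {1}".format(getpid(), item))

if __name__ == '__main__':
    pool = Pool(processes=MAX_WORKERS, initializer=worker_main)
    for campaign_id in range(430):    # stand-in for the IDs read from MySQL
        q.put(campaign_id)
    for _ in range(MAX_WORKERS):      # one sentinel per worker
        q.put(None)
    pool.close()                      # no more tasks will be submitted
    pool.join()                       # wait for every worker to finish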
Goal:
Accelerate the random walk generation by using multiple processes.
Put the list of vertex ids from which I want random walks to be generated into an input queue
Start as many processes as possible with the correct parameters
Make them put the random walks into an output queue
Wait for completion
Read the output queue
What I am doing:
# Libraries imports
from multiprocessing import cpu_count, Process, Queue
import queue
import configparser
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __
# Function the processes are supposed to execute
def job(proc_id:int, siq:Queue, rwq:Queue, g:AnonymousTraversalSource, length:int):
    while True:
        try:
            # Get next element in ids queue
            start_id = siq.get_nowait()
        except queue.Empty:
            # If the ids queue is empty, then terminate
            break
        else:
            # Do a random walk of length <length> from the vertex with id <start_id>
            random_walk = g.V(start_id).repeat(
                __.local(__.both().sample(1))
            ).times(length).path().next()
            print(f"{proc_id}: rw obtained")
            # Transform the list of vertices into a comma-separated string of ids
            rwq.put(",".join(
                [str(v.id) for v in random_walk]
            ))
            print(f"{proc_id}: rw handled")
if __name__ == "__main__":
# Get the parameters from the <config.ini> configuration file
config = configparser.RawConfigParser()
config.read("config.ini")
jg_uri = config["JANUSGRAPH"]["URI"]
file_random_walks = config["FILES"]["RANDOM_WALKS"]
walks_nb_per_node = int(config["WALKS"]["NB_PER_NODE"])
walks_length = int(config["WALKS"]["LENGTH"])
# Connect to Janus Graph
connection = DriverRemoteConnection(jg_uri, "g")
g_main = traversal().withRemote(connection)
# Instantiate the queues and populate the ids one
start_ids_queue = Queue()
random_walks_queue = Queue()
for vertex in g_main.V().has("vertex_label", "<label>").fold().next():
start_ids_queue.put(vertex.id)
# Create and start the processes
nb_processes = cpu_count()
processes = []
for i in range(nb_processes):
p = Process(target=job, args=(
i,
start_ids_queue,
random_walks_queue,
g_main,
walks_length
))
processes.append(p)
p.start()
for p in processes:
p.join()
# Once the processes are terminated, read the random walks queue
random_walks = []
while not random_walks_queue.empty():
random_walks.append(random_walks_queue.get())
# Do something with the random walks
...
Issue:
Once the processes are started, nothing seems to happen. I never get the X: rw obtained/X: rw handled messages. With a bit more logging, I can see that the queries have been sent, yet they never finish.
In the logs, when performing the first g_main.V().has("vertex_label", "<label>").fold().next() in the main process (when I populate the ids queue), I have the following message:
DEBUG:gremlinpython:submit with bytecode '[['V'], ['has', 'vertex_label', 'movie'], ['fold']]'
DEBUG:gremlinpython:message '[['V'], ['has', 'vertex_label', '<label>'], ['fold']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V'], ['has', 'vertex_label', '<label>'], ['fold']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
When the other processes send their queries, I have similar logs:
DEBUG:gremlinpython:submit with bytecode '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:message '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
The issue seems not to reside in the query sent, but instead in the indefinite wait that ensues.
If you know of an issue with gremlinpython and multiprocessing, if there is a problem in my multi-processing code, or if you have any explanation that I may have overlooked, please explain to me! Thanks a lot to everyone reading this!
Solutions:
The first partial solution that I found is to use multi-threading instead of multiprocessing:
import configparser
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __
import threading
class myThread(threading.Thread):
    def __init__(self, thread_id, g, length):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.thread_count = 0
        self.gtraversal = g
        self.walk_length = length
        self.nb_walks = nb_walks

    def run(self):
        while True:
            start_ids_list_lock.acquire()
            try:
                start_id = start_ids_list.pop(0)
                start_ids_list_lock.release()
            except IndexError:
                start_ids_list_lock.release()
                break
            else:
                self.thread_count += 1
                random_walk = job(
                    vertex_id=start_id,
                    g=self.gtraversal,
                    length=self.walk_length,
                    nb_walks=self.nb_walks
                )
                random_walks_list_lock.acquire()
                random_walks_list.append(random_walk)
                random_walks_list_lock.release()
                logging.info(f"Thread {self.thread_id}: {self.thread_count} done")

def job(vertex_id:int, g:AnonymousTraversalSource, length:int) -> str:
    random_walk = g.V(vertex_id).repeat(
        __.local(__.both().sample(1))
    ).times(length).path().next()
    return ",".join(random_walk)
config = configparser.RawConfigParser()
config.read("config.ini")
jg_uri = config["JANUSGRAPH"]["URI"]
file_random_walks = config["FILES"]["RANDOM_WALKS"]
walks_length = int(config["WALKS"]["LENGTH"])
connection = DriverRemoteConnection(jg_uri, "g")
g_main = traversal().withRemote(connection)
threads = []
start_ids_list = []
random_walks_list = []
random_walks_list_lock = threading.Lock()
start_ids_list_lock = threading.Lock()
start_ids_list = [vertex.id for vertex in g_main.V().has("vertex_label", "<label>").fold().next()]
nb_vertices = len(start_ids_list)
nb_threads = 6
for i in range(nb_threads):
    thread = myThread(
        thread_id=i,
        g=g_main,
        length=walks_length
    )
    thread.start()
    threads.append(thread)
for t in threads:
    t.join()
# Do something with the random walks
...
This solution effectively works and improves the execution time of the program. It isn't a full answer though, as it doesn't explain why multiprocessing is not performing as I expected.
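One possible explanation worth testing, purely as an assumption on my part and not something confirmed above: gremlinpython's driver keeps an asyncio event loop and a websocket connection open, and those generally do not survive being shared across a fork, which can leave the children waiting forever on a connection they do not really own. A hedged sketch of the alternative, where each worker process opens its own DriverRemoteConnection instead of receiving g_main through the Process arguments, could look like this:

from multiprocessing import Process, Queue
import queue
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __

def job(proc_id: int, siq: Queue, rwq: Queue, jg_uri: str, length: int):
    # Each worker builds its own connection and traversal source.
    connection = DriverRemoteConnection(jg_uri, "g")
    g = traversal().withRemote(connection)
    try:
        while True:
            try:
                start_id = siq.get_nowait()
            except queue.Empty:
                break
            random_walk = g.V(start_id).repeat(
                __.local(__.both().sample(1))
            ).times(length).path().next()
            rwq.put(",".join(str(v.id) for v in random_walk))
    finally:
        connection.close()

# Started with: Process(target=job, args=(i, start_ids_queue, random_walks_queue, jg_uri, walks_length))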
I'm using the ProcessPoolExecutor context manager to run several Kafka consumers in parallel. I need to store the process IDs of the child processes so that, later, I can cleanly terminate those processes. I have the following code:
class MultiProcessConsumer:
    ...

    def run_in_parallel(self):
        parallelism_factor = 5
        with ProcessPoolExecutor() as executor:
            processes = [executor.submit(self.consume) for _ in range(parallelism_factor)]
            # It would be nice if I could write [process.pid for process in processes] to a file here.

    def consume(self):
        while True:
            for message in self.kafka_consumer:
                do_stuff(message)
I know I can use os.getpid() in the consume method to get the PIDs. But handling them properly (in case consumers are constantly shutting down or starting up) requires some extra work.
How would you propose that I get and store the PIDs of the child processes in such a context?
os.getpid() seems to be the way to go. Just pass the PIDs back through a Queue or Pipe, together with a random UUID that you hand to each task beforehand so you can match the PID to the task.
from concurrent.futures import ProcessPoolExecutor
import os
import time
import uuid
#from multiprocessing import Process, Queue
import multiprocessing
import queue
#The Empty exception lives in queue; multiprocessing borrows
#it from there
# https://stackoverflow.com/questions/9908781/sharing-a-result-queue-among-several-processes
m = multiprocessing.Manager()
q = m.Queue()
def task(n, queue, uuid):
    my_pid = os.getpid()
    print("Executing our Task on Process {}".format(my_pid))
    queue.put((uuid, my_pid))
    time.sleep(n)
    return n * n

def main():
    with ProcessPoolExecutor(max_workers=3) as executor:
        some_dict = {}
        for i in range(10):
            print(i)
            u = uuid.uuid4()
            f = executor.submit(task, i, q, u)
            some_dict[u] = [f, None]  # PID not known here
            try:
                rcv_uuid, rcv_pid = q.get(block=True, timeout=1)
                some_dict[rcv_uuid][1] = rcv_pid  # store PID
            except queue.Empty as e:
                print('handle me', e)
            print('I am', rcv_uuid, 'and my PID is', rcv_pid)

if __name__ == '__main__':
    main()
Although the field is private, you could use ProcessPoolExecutor's self._processes. The code snippet below shows how to use this variable.
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait
nb_processes = 100
executor = ProcessPoolExecutor(nb_processes)
futures = [executor.submit(os.getpid) for _ in range(nb_processes)]
wait(futures)
backends = list(map(lambda x: x.result(), futures))
assert len(set(backends)) == nb_processes
In the case above, an assertion error is raised. This is because a new task can reuse the forked processes in the pool, so you cannot learn all of the forked process IDs through the method you mentioned. Hence, you can do as follows:
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait
nb_processes = 100
executor = ProcessPoolExecutor(nb_processes)
futures = [executor.submit(os.getpid) for _ in range(nb_processes)]
wait(futures)
backends = list(map(lambda x: x.result(), futures))
assert len(set(executor._processes.keys())) == nb_processes
print('all of PID are: %s.' % list(executor._processes.keys()))
If you don't want to break the encapsulation, you could inherit from ProcessPoolExecutor and create a new property for that.
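A minimal sketch of that subclassing idea (my own illustration of the suggestion above, not code from the library): wrap the private _processes mapping behind a read-only property so callers never touch the underscore field directly.

from concurrent.futures import ProcessPoolExecutor

class PidAwareProcessPoolExecutor(ProcessPoolExecutor):
    @property
    def worker_pids(self):
        # _processes maps pid -> process object for the live workers;
        # it is only populated once workers have actually been spawned.
        return list(self._processes.keys())

if __name__ == "__main__":
    with PidAwareProcessPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(pow, 2, n) for n in range(8)]
        for f in futures:
            f.result()  # force the workers to spin up and finish
        print("worker PIDs:", executor.worker_pids)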
I have code that spawns multiple processes to check the line count of files and maintain records in a database. The working code is shown below:
import multiprocessing as mp
from multiprocessing import Pool
import os
import time
import mysql.connector
"""Function to check the count of the file"""
def file_wc(fname):
    with open('/home/vaibhav/Desktop/Input_python/' + fname) as f:
        count = sum(1 for line in f)
    return (fname, count)

class file_audit:
    def __init__(self):
        """Initialising the constructor for getting the names of files
        and referencing the outside-class function"""
        folder = '/home/vaibhav/Desktop/Input_python'
        self.fnames = (name for name in os.listdir(folder))
        self.file_wc = file_wc

    def count_check(self):
        "Creating 4 worker processes to check the count of the files in parallel"
        pool = Pool(4)
        self.m = list(pool.map(self.file_wc, list(self.fnames), 4))
        pool.close()
        pool.join()

    def database_updation(self):
        """To maintain an entry in the database with details
        like filename and records present in the file"""
        self.db = mysql.connector.connect(host="localhost", user="root", password="root", database="python_showtime")
        # prepare a cursor object using cursor() method
        self.cursor = self.db.cursor()
        query_string = ("INSERT INTO python_showtime.audit_capture"
                        "(name,records)"
                        "VALUES(%s,%s)")
        #data_user = (name,records)
        for each in self.m:
            self.cursor.execute(query_string, each)
            self.db.commit()
        self.cursor.close()

start_time = time.time()
print("My program took", time.time() - start_time, "to run")
#if __name__ == '__main__':
x = file_audit()
x.count_check()        # To check the count by spawning multiple processes
x.database_updation()  # To maintain the entry in the database
Point to be considered
Now, if I put my function inside the class and comment out self.file_wc = file_wc in the constructor, I get the error "can't pickle generator objects". I have a fair understanding that some objects cannot be pickled, so I want to know what exactly is happening in the background, in very simple terms. I got the reference from here or here to make the code work.
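For intuition, here is a small, hedged illustration (my own, not the poster's code) of why the error shows up: Pool.map has to pickle whatever callable it is given, and a bound method drags its instance along, so a generator stored on that instance makes the whole thing unpicklable.

import pickle

gen = (n for n in range(3))
try:
    pickle.dumps(gen)
except TypeError as exc:
    print(exc)  # e.g. "cannot pickle 'generator' object"

# When file_wc is an instance method, Pool.map(self.file_wc, ...) has to ship
# `self` to the workers, and `self.fnames` (a generator) makes that pickling fail.
# Two common workarounds: keep the worker function at module level (as in the
# working code above), or store a list instead of a generator:
#     self.fnames = list(os.listdir(folder))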
I have a scheduler job that needs to make multiple requests to check the live status of 1000 users at a time, but the server limits each API request to a maximum of 50 users. With the following for-loop approach it takes around 66 seconds for 1000 users (i.e. for 20 API calls).
from apscheduler.schedulers.blocking import BlockingScheduler
sched = BlockingScheduler()
def shcdulerjob():
    """
    """
    uidlist = todays_userslist()  # Get around 1000 users from table
    #-- DIVIDE LIST BY GIVEN SIZE (here 50)
    split_list = lambda lst, sz: [lst[i:i+sz] for i in range(0, len(lst), sz)]
    idlists = split_list(uidlist, 50)  # SERVER MAX LIMIT - 50 ids/request
    for idlist in idlists:
        apiurl = some_server_url + "&ids=" + str(idlist)
        resp = requests.get(apiurl)
        save_status(resp.json())  #-- Save status to db

if __name__ == "__main__":
    sched.add_job(shcdulerjob, 'interval', minutes=10)
    sched.start()
So:
Is there any workaround to reduce the time required to fetch the API results?
Does Python's APScheduler provide any multiprocessing option to process such API requests within a single job?
You could try to apply Python's thread pool from the concurrent.futures module, if the server allows concurrent requests. That way you would parallelise the processing instead of the scheduling itself.
There are some good examples provided in the documentation here (if you're using Python 2, there is a sort of equivalent module).
e.g.
import concurrent.futures
import multiprocessing
import requests
import time
import json
cpu_start_time = time.process_time()
clock_start_time = time.time()
queue = multiprocessing.Queue()
uri = "http://localhost:5000/data.json"
users = [str(user) for user in range(1, 50)]
with concurrent.futures.ThreadPoolExecutor(multiprocessing.cpu_count()) as executor:
    for user_id, result in zip(
        [str(user) for user in range(1, 50)],
        executor.map(lambda x: requests.get(uri, params={id: x}).content, users)
    ):
        queue.put((user_id, result))

while not queue.empty():
    user_id, rs = queue.get()
    print("User ", user_id, json.loads(rs.decode()))
cpu_end_time = time.process_time()
clock_end_time = time.time()
print("Took {0:.03}s [{1:.03}s]".format(cpu_end_time-cpu_start_time, clock_end_time-clock_start_time))
If you want to use a process pool, just make sure you don't use shared resources, e.g. the queue, and write your data out independently.
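As a hedged sketch of that process-pool variant (my own illustration; the uri below is the same placeholder as above): each worker returns its result directly, and the parent collects them from executor.map, so no queue is shared between processes.

import concurrent.futures
import requests

uri = "http://localhost:5000/data.json"

def fetch(user_id):
    # Runs in a worker process: perform the request and return the data directly.
    resp = requests.get(uri, params={"id": user_id})
    return user_id, resp.content

if __name__ == "__main__":
    users = [str(u) for u in range(1, 50)]
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for user_id, content in executor.map(fetch, users):
            print("User", user_id, len(content), "bytes")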
After some searching on Google and reading posts on Stack Overflow and other sites, I'm still confused about how I can apply a queue and threading to my code:
import psycopg2
import sys
import re
# for threading and queue
import multiprocessing
from multiprocessing import Queue
# for threading and queue
import time
from datetime import datetime
class Database_connection():
    def db_call(self, query, dbHost, dbName, dbUser, dbPass):
        try:
            con = None
            con = psycopg2.connect(host=dbHost, database=dbName,
                                   user=dbUser, password=dbPass)
            cur = con.cursor()
            cur.execute(query)
            data = cur.fetchall()
            resultList = []
            for data_out in data:
                resultList.append(data_out)
            return resultList
        except psycopg2.DatabaseError, e:
            print 'Error %s' % e
            sys.exit(1)
        finally:
            if con:
                con.close()
w = Database_connection()
sql = "select stars from galaxy"
startTime = datetime.now()
for result in w.db_call(sql, "x", "x", "x", "x"):
    print result[0]
print "Runtime: " + str(datetime.now()-startTime)
Let's suppose the result will be 100+ values. How can I put those 100+ results on a queue and process (print, for example) them 5 at a time using the queue and multiprocessing modules?
What do you want this code to do?
You get no output from this code because get() returns the next item from the queue (doc). You are putting the letters from the SQL response into the queue one letter at a time: the i in for i... loops over the list returned by w.db_call, those items are (I assume) strings, and you then iterate over each string, adding one character at a time to the queue. The next thing you do is remove the element you just added, which leaves the queue unchanged over each pass through the loop. If you put a print statement in the loop, it prints the letter it just got from the queue.
Queues are used to pass information between processes. I think you are trying to set up a producer/consumer pattern where you have one process add things to the queue and multiple other processes which consume things from the queue. See the working example of multiprocessing.Queue and the links contained therein (example, main documentation).
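Here is a minimal producer/consumer sketch of that pattern (my own illustration, with made-up stand-in rows): the parent enqueues the query results, five worker processes consume them, and one None sentinel per worker signals that the work is done.

from multiprocessing import Process, Queue

NUM_WORKERS = 5

def consumer(q, worker_id):
    while True:
        item = q.get()
        if item is None:      # sentinel: no more work
            break
        print("worker %d got %r" % (worker_id, item))

if __name__ == "__main__":
    q = Queue()
    workers = [Process(target=consumer, args=(q, i)) for i in range(NUM_WORKERS)]
    for w in workers:
        w.start()
    for row in ["star_%d" % n for n in range(100)]:  # stand-in for the SQL rows
        q.put(row)
    for _ in range(NUM_WORKERS):
        q.put(None)           # one sentinel per worker
    for w in workers:
        w.join()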
Probably the simplest way to get this working, as long as you don't need it to run in an interactive shell, is to use Pool (lifted almost verbatim from the multiprocessing documentation):
from multiprocessing import Pool
p = Pool(5)  # sets the number of worker processes you want

def f(res):
    # put whatever you want to do with each of the query results in here
    return res

result_lst = w.db_call(sql, "x", "x", "x", "x")
proced_results = p.map(f, result_lst)
which applies whatever you want to do to each result (written into the function f) and returns the results of that manipulation as a list. The number of sub-processes to use is set by the argument to Pool.
This is my suggestion...
import Queue
from threading import Thread
class Database_connection:
    def db_call(self, query, dbHost, dbName, dbUser, dbPass):
        # your code here
        return

# in this example each thread will execute this function
def processFtpAddrMt(queue):
    # loop will continue until the queue containing FTP addresses is empty
    while True:
        # get an ftp address; an exception will be raised when the
        # queue is empty and the loop will break
        try: ftp_addr = queue.get()
        except: break
        # put code to process the ftp address here
        # let the queue know this task is done
        queue.task_done()

w = Database_connection()
sql = "select stars from galaxy"
ftp_addresses = w.db_call(sql, "x", "x", "x", "x")

# put each result of the SQL call in a Queue class
ftp_addr_queue = Queue.Queue()
for addr in ftp_addresses:
    ftp_addr_queue.put(addr)

# create five threads where each one will run processFtpAddrMt
# pass the queue to the processFtpAddrMt function
for x in range(0, 5):
    t = Thread(target=processFtpAddrMt, args=(ftp_addr_queue,))
    t.setDaemon(True)
    t.start()

# blocks further execution of the script until all queue items have been processed
ftp_addr_queue.join()
It uses the Queue class to store your SQL results and the Thread class to process the queue. Five threads are created, and each one runs the processFtpAddrMt function, which takes FTP addresses from the queue until the queue is empty. All you have to do is add the code for processing the FTP address. Hope this helps.
I was able to solve the problem with the following:
def worker():
    w = Database_connection()
    sql = "select stars from galaxy"
    for result in w.db_call(sql, "x", "x", "x", "x"):
        if result:
            pass  # process the result here

jobs = []
startTime = datetime.now()
for i in range(1):
    p = multiprocessing.Process(target=worker)
    jobs.append(p)
    p.start()
print "Runtime: " + str(datetime.now()-startTime)
I believe it is not the best way to do it, but for now it solved my problem :)