How do I speed up this test code in Python 2.7 talking to Redis on Windows XP?
Would multiprocessing be better? My load rate is about 6,000/s versus the published 100,000/s.
I chose 100,000 keys, but could lower that while testing. The process takes 15 seconds.
Would changing settings on the server help?
import time
from time import strftime
import redis
import threading, Queue
start_time = time.time()
cxn = redis.StrictRedis('127.0.0.1',6379,1)
class WorkerMain(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while 1:
            try:  # take a job from the queue
                row = self.queue.get_nowait()
            except Queue.Empty: raise SystemExit
            try:
                cxn.set(row, "Row")
                #print (row, "Row")
            except: print 'Setup Error'

if __name__ == '__main__':
    connections = 5
    sml = range(1, 100000)
    queue = Queue.Queue()
    for row in sml:
        queue.put(str(row))

    threads = []
    for dummy in range(connections):
        t = WorkerMain(queue)
        t.start()
        threads.append(t)

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    print
    end_time = time.time()
    duration = end_time - start_time
    print "Duration: %s" % duration
I used the code below for multiprocessing and "monitored" the data with the Redis CLI; not all of the data made it into the server.
from multiprocessing import Pool
import time
import redis
start_time = time.time()
cxn = redis.Redis('127.0.0.1',6379,1)
def rset(var):
    cxn.set(var, "value")

if __name__ == '__main__':
    sml = range(1, 10000)
    #for x in sml: print x
    pool = Pool(processes=5)
    for row in sml:
        pool.apply_async(rset, [(row,)])
        #print result.get(),
    end_time = time.time()
    duration = end_time - start_time
    print "Duration: %s" % duration
Here is the pipelined code; I just commented out the threading parts.
from time import strftime
import redis
import threading, Queue
start_time = time.time()
cxn = redis.StrictRedis('127.0.0.1',6379,0)
pipe = cxn.pipeline(transaction=False)
class WorkerMain(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while 1:
            try:  # take a job from the queue
                row = self.queue.get_nowait()
            except Queue.Empty: raise SystemExit
            try:
                cxn.set(row, "Row")
                #print (row, "Row")
            except: print 'Setup Error'

if __name__ == '__main__':
    #connections = 5
    sml = range(1, 100000)
    #queue = Queue.Queue()
    for row in sml:
        #queue.put(str(row))
        pipe.set(str(row), "value").execute()  # key, value

    # threads = []
    # for dummy in range(connections):
    #     t = WorkerMain(queue)
    #     t.start()
    #     threads.append(t)
    #
    # # wait for all threads to finish
    # for thread in threads:
    #     thread.join()

    print
    end_time = time.time()
    duration = end_time - start_time
    print "Duration: %s" % duration
Use pipelines. A pipeline batches commands so you don't pay the network round trip for each one.
See:
Section on Pipelines over here https://github.com/andymccurdy/redis-py
Pipelining on Redis.io - http://redis.io/topics/pipelining
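For example, here is a minimal sketch of batching the question's SETs through one pipeline, flushing every 1,000 commands (the batch size is an arbitrary choice, not something redis-py requires):

import time
import redis

cxn = redis.StrictRedis('127.0.0.1', 6379, 0)
pipe = cxn.pipeline(transaction=False)

start_time = time.time()
count = 0
for row in range(1, 100000):
    pipe.set(str(row), "value")   # queued client-side, nothing sent yet
    count += 1
    if count % 1000 == 0:
        pipe.execute()            # one round trip for 1,000 commands
pipe.execute()                    # flush the remainder
print("Duration: %s" % (time.time() - start_time))

Note that executing the pipeline after every single set() (as in the question's code) gives you no batching at all.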
Using threading for better performance is not really a good idea if you use CPython (the standard Python interpreter), because of the GIL:
http://wiki.python.org/moin/GlobalInterpreterLock
multiprocessing should work better.
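If you do go the multiprocessing route, a rough sketch might look like the following: each worker process opens its own Redis connection in an initializer and pipelines one chunk per task. The process count and chunk size here are arbitrary choices, not measured optima.

import time
import redis
from multiprocessing import Pool

def init_worker():
    # one connection per worker process; sharing a connection across processes is unsafe
    global cxn
    cxn = redis.StrictRedis('127.0.0.1', 6379, 0)

def load_chunk(rows):
    pipe = cxn.pipeline(transaction=False)
    for row in rows:
        pipe.set(row, "value")
    pipe.execute()                # one round trip per chunk
    return len(rows)

if __name__ == '__main__':
    keys = [str(n) for n in range(1, 100000)]
    chunks = [keys[i:i + 5000] for i in range(0, len(keys), 5000)]
    start = time.time()
    pool = Pool(processes=4, initializer=init_worker)
    pool.map(load_chunk, chunks)
    pool.close()
    pool.join()
    print("Duration: %s" % (time.time() - start))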
Related
I have a scraper which initiates a requests session and fetches some data over IPv6. I now have a list of 10,000 IPs, and I have parallelized it using threading, but it is giving an error.
I need help finding the issue.
import requests, queue, threading, urllib3, json, pandas as pd, os, time, datetime, inspect

num_threads = 2
root = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
with open(root + "/ip_list.txt") as ips:
    device_ip = list(ips)
class Writer_Worker(threading.Thread):
    def __init__(self, queue, df, *args, **kwargs):
        if not queue:
            print("Device Queue not specified")
            exit(1)
        self.out_q = queue
        self.df = df
        super().__init__(*args, **kwargs)

    def run(self):
        while True:
            try:
                device_details = self.out_q.get(timeout=3)
            except queue.Empty:
                return
            self.df[device_details[0]] = device_details
            self.out_q.task_done()

class Worker(threading.Thread):
    def __init__(self, queue, out_queue, device_password, *args, **kwargs):
        if not queue:
            print("Device Queue not specified")
            exit(1)
        self.queue = queue
        self.pas = device_password
        self.out_q = out_queue
        super().__init__(*args, **kwargs)

    def run(self):
        while True:
            try:
                device_ip = self.queue.get(timeout=3)
            except queue.Empty:
                return
            self.connect_to_device_and_process(device_ip)
            self.queue.task_done()

    def connect_to_device_and_process(self, device_ip):
        st = str("Online")
        try:
            r = requests.post("https://["+device_ip+"]/?q=index.login&mimosa_ajax=1", {"username":"configure", "password":self.pas}, verify=False)
        except requests.exceptions.ConnectionError:
            st = str("Offline")
            self.out_q.put([device_ip,st,"","","","","","","","","","","","","","","","","",""])
            return
        finally:
            if 'Online' in st:
                r = requests.get("https://["+device_ip+"]/cgi/dashboard.php", cookies=r.cookies, verify=False)
                if "Response [401]" in str(r):
                    st2 = str("Password Error")
                    self.out_q.put([device_ip,st2,"","","","","","","","","","","","","","","","","",""])
                else:
                    data = json.loads(r.content.decode())
                    output5 = data['config']['Spectrum_Power']
                    self.out_q.put([device_ip,st,output5['Auto_Power'].replace('2', 'Max Power').replace('1', 'Min Power').replace('0', 'off'),output5['AutoConfig']])

def main():
    start = time.time()
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    pas = input("Enter Device Password:")
    df = pd.DataFrame(columns=["IP", "Status", "Auto_Power", "AutoChannel"])
    q = queue.Queue(len(device_ip))
    for ip in device_ip:
        q.put_nowait(ip)
    out_q = queue.Queue(len(device_ip))
    Writer_Worker(out_q, df).start()
    for _ in range(num_threads):
        Worker(q, out_q, pas).start()
    q.join()
    print(df)
    df.to_excel('iBridge_C5x_Audit_Report.xlsx', sheet_name='Detail', index=False)

if __name__ == "__main__":
    main()
Below is the error while running the script; it seems I am unable to log in to this device.
Any help is appreciated.
You should use a thread pool that distributes the work between a fixed number of threads. This is a core feature of Python since version 3.2.
from concurrent.futures import ThreadPoolExecutor
Define a function perform(ip) that performs the request for one IP.
Set a variable numThreads to the number of desired threads.
Then run the thread-pool executor:
print(f'Using {numThreads} threads')
with ThreadPoolExecutor(max_workers=numThreads) as pool:
    success = all(pool.map(perform, ips))
Source: https://docs.python.org/3/library/concurrent.futures.html
On that page you find an example even better tailored to your application: https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
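Putting those pieces together, here is a rough sketch. The perform function is a hypothetical stand-in for your per-device request logic (returning True when the device answered), and the IP list is a placeholder for the contents of ip_list.txt:

from concurrent.futures import ThreadPoolExecutor
import requests

ips = ["2001:db8::1", "2001:db8::2"]   # stand-in list; you would load ip_list.txt instead
numThreads = 2

def perform(ip):
    # hypothetical per-IP worker: one login request, True if it answered
    try:
        r = requests.post("https://[" + ip + "]/?q=index.login&mimosa_ajax=1",
                          {"username": "configure", "password": "secret"},
                          verify=False, timeout=10)
        return r.ok
    except requests.exceptions.RequestException:
        return False

print(f'Using {numThreads} threads')
with ThreadPoolExecutor(max_workers=numThreads) as pool:
    success = all(pool.map(perform, ips))
print('all devices reachable:', success)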
from threading import Thread
th = Thread(target=self.fill_imdb, args=(movies_info_part, "thread " + str(count)))
th.start()
fill_imdb is my method
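A minimal sketch of that raw-Thread pattern applied to a list of work chunks (fill_chunk and the chunking are hypothetical placeholders for your own worker and data):

from threading import Thread

def fill_chunk(chunk, name):
    # hypothetical worker: process one slice of the work list
    for item in chunk:
        print(name, 'handling', item)

work = list(range(10))
chunk_size = 5
threads = []
for count, start in enumerate(range(0, len(work), chunk_size)):
    th = Thread(target=fill_chunk, args=(work[start:start + chunk_size], "thread " + str(count)))
    th.start()
    threads.append(th)
for th in threads:
    th.join()   # wait for all chunks to finish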
I am working on a scraper that rotates IPs. I have created a small MVP in a notebook that works as expected:
import logging
import time
import random
import threading
from datetime import datetime
from datetime import timedelta
logging.basicConfig(
    level=logging.DEBUG,
    format='(%(threadName)-10s) %(message)s',
)

class Controller(object):
    def __init__(self, event):
        self.start_time = datetime.now()
        self.event = event

    def worker(self):
        while True:
            if self.event.is_set():
                rand_sleep_time = random.randint(1, 10) / 5
                logging.debug("Sleeping for %.2f secs" % rand_sleep_time)
                time.sleep(rand_sleep_time)
                logging.debug("Werking")
            else:
                time.sleep(1)

    def blocker(self):
        while True:
            rand_sleep_time = random.randint(3, 6)
            logging.debug("Sleeping for %.2f secs" % rand_sleep_time)
            time.sleep(rand_sleep_time)
            if datetime.now() > self.start_time + timedelta(seconds=10):
                self.event.clear()  # only stop the execution for when the ip is updated
                logging.debug("ALL THREADS SLEEP NOW!")
                time.sleep(10)
                self.event.set()  # you can now proceed with the computations
                self.start_time = datetime.now()

start_time = datetime.now()
e = threading.Event()
e.set()
c = Controller(e)
for thread in range(NUM_THREADS):
    t = threading.Thread(target=c.worker, name='Thread-Worker-{}'.format(thread + 1))
    t.start()
threading.Thread(target=c.blocker, name='Thread-Blocker-1').start()
So the workers above do some work, then the blocker halts all of them for a brief moment while it updates the "ip", and then the workers resume. When I take this logic into production, it fails (I assume because the workers do not stop). Unfortunately, I cannot include all of the code, but here is the main part. Hopefully this is enough, as the other parts are not related to the fact that the IP updater does not stop the other threads. The only difference in this implementation is that I have used classes; perhaps that should be changed (because the methods have a self argument and I'm changing it). But if the IP updater successfully stopped the other threads, there should be no problem, no?
class ThreadedNewsParser(object):
    """
    This little guy parses the news with multiple threads and dynamically changes the ip of the sessions
    """
    def __init__(self, update_ip_in, num_threads, date_start, date_end):
        assert isinstance(num_threads, int)
        assert num_threads > 0
        assert any(isinstance(date_start, type_) for type_ in [datetime, date])
        assert any(isinstance(date_end, type_) for type_ in [datetime, date])
        self.start_time = datetime.now()
        self.event = threading.Event()
        self.event.set()
        self.update_ip_in = update_ip_in
        self.check_ip_url = 'https://httpbin.org/ip'
        autolog("STARTING WORK ON IP: {}".format(session.get(self.check_ip_url).text), logging.debug)
        self.num_threads = num_threads
        self.date_start = date_start
        self.date_end = date_end
        self.dates = [date for date in date_range(date_start, date_end)]
        self.p = DailyCompanyNewsParser(2008, 1, 1)  # the date here does not matter

    def worker(self):
        while len(self.dates) > 0:
            if self.event.is_set():
                print("THREAD WERKING!")
                pause = random.randint(1, 5) / 5
                autolog('THREAD SLEEPING %.2f' % pause, logging.debug)
                time.sleep(pause)
                if len(self.dates) > 0:
                    date = self.dates.pop(0)
                    self.p.get_news_for_all_stocks(verbose=True, date_=date)
            else:
                print("THREAD SLEEPING")
                time.sleep(10)  # so that the threads do not check if the event is set instantaneously

    def ip_updater(self):  # this is the blocker
        while len(self.dates) > 0:
            autolog("IP_UPDATER SLEEPING FOR: {}".format(self.update_ip_in / 4), logging.debug)
            time.sleep(self.update_ip_in / 4)  # do not check the condition every instance
            if datetime.now() > self.start_time + timedelta(seconds=self.update_ip_in):
                print("ALL THREADS SLEEP NOW!")
                autolog("ALL THREADS SLEEP NOW!", logging.info)
                self.event.clear()  # Make all other threads sleep so that we can update the IP
                time.sleep(10)
                get_new_ip()
                self.start_time = datetime().now()
                # autolog("Obtained new IP address: {}".format(session.get(self.check_ip_url).text), logging.debug)
                autolog("ALL THREADS WAKE UP NOW!", logging.info)
                print("ALL THREADS WAKE UP NOW!")
                self.event.set()

    def run(self):
        for thread in range(self.num_threads):
            t = threading.Thread(target=self.worker, name='Thread-Worker-{}'.format(thread + 1))
            t.start()
        threading.Thread(target=self.ip_updater, name='Thread-IPUpdater-1').start()
Rewriting everything so that event and start_time are global variables does not solve the issue either. For example:
class ThreadedNewsParser(object):
    """
    This little guy parses the news with multiple threads and dynamically changes the ip of the sessions
    """
    def __init__(self, update_ip_in, num_threads, date_start, date_end):
        assert isinstance(num_threads, int)
        assert num_threads > 0
        assert any(isinstance(date_start, type_) for type_ in [datetime, date])
        assert any(isinstance(date_end, type_) for type_ in [datetime, date])
        self.update_ip_in = update_ip_in
        self.check_ip_url = 'https://httpbin.org/ip'
        autolog("STARTING WORK ON IP: {}".format(session.get(self.check_ip_url).text), logging.debug)
        self.num_threads = num_threads
        self.date_start = date_start
        self.date_end = date_end
        self.dates = [date for date in date_range(date_start, date_end)]
        self.p = DailyCompanyNewsParser(2008, 1, 1)  # the date here does not matter

    def worker(self):
        global event
        while len(self.dates) > 0:
            if event.is_set():
                print("THREAD WERKING!")
                pause = random.randint(1, 5) / 5
                autolog('THREAD SLEEPING %.2f' % pause, logging.debug)
                time.sleep(pause)
                if len(self.dates) > 0:
                    date = self.dates.pop(0)
                    self.p.get_news_for_all_stocks(verbose=True, date_=date)
            else:
                print("THREAD SLEEPING")
                time.sleep(10)  # so that the threads do not check if the event is set instantaneously

    def ip_updater(self):  # this is the blocker
        global start_time
        global event
        while len(self.dates) > 0:
            autolog("IP_UPDATER SLEEPING FOR: {}".format(self.update_ip_in / 4), logging.debug)
            time.sleep(self.update_ip_in / 4)  # do not check the condition every instance
            if datetime.now() > start_time + timedelta(seconds=self.update_ip_in):
                print("ALL THREADS SLEEP NOW!")
                autolog("ALL THREADS SLEEP NOW!", logging.info)
                event.clear()  # Make all other threads sleep so that we can update the IP
                time.sleep(10)
                get_new_ip()
                start_time = datetime().now()
                # autolog("Obtained new IP address: {}".format(session.get(self.check_ip_url).text), logging.debug)
                autolog("ALL THREADS WAKE UP NOW!", logging.info)
                print("ALL THREADS WAKE UP NOW!")
                event.set()

    def run(self):
        for thread in range(self.num_threads):
            t = threading.Thread(target=self.worker, name='Thread-Worker-{}'.format(thread + 1))
            t.start()
        threading.Thread(target=self.ip_updater, name='Thread-IPUpdater-1').start()
I've searched Stack Overflow and, although I've found many questions on this, I haven't found an answer that fits my situation; I'm not a strong Python programmer, so I couldn't adapt those answers to my needs.
I've looked here to no avail:
kill a function after a certain time in windows
Python: kill or terminate subprocess when timeout
signal.alarm replacement in Windows [Python]
I am using multiprocessing to run multiple SAP windows at once to pull reports. It is set up to run on a schedule every 5 minutes. Every once in a while, one of the reports stalls due to the GUI interface and never ends. I don't get an error or exception; it just stalls forever. What I would like is a timeout on the part of the code that executes in SAP: if it takes longer than 4 minutes, it times out, closes SAP, skips the rest of the code, and waits for the next scheduled report time.
I am using Python 2.7 on Windows.
import multiprocessing
from multiprocessing import Manager, Process
import time
import datetime
### OPEN SAP ###
def start_SAP():
    print 'opening SAP program'

### REPORTS IN SAP ###
def report_1(q, lock):
    while True:  # logic to get shared queue
        if not q.empty():
            lock.acquire()
            k = q.get()
            time.sleep(1)
            lock.release()
            break
        else:
            time.sleep(1)
    print 'running report 1'

def report_2(q, lock):
    while True:  # logic to get shared queue
        if not q.empty():
            lock.acquire()
            k = q.get()
            time.sleep(1)
            lock.release()
            break
        else:
            time.sleep(1)
    print 'running report 2'

def report_3(q, lock):
    while True:  # logic to get shared queue
        if not q.empty():
            lock.acquire()
            k = q.get()
            time.sleep(1)
            lock.release()
            break
        else:
            time.sleep(1)
    time.sleep(60000)  # mimicking the stall for report 3 that takes longer than the allotted time
    print 'running report 3'

def report_N(q, lock):
    while True:  # logic to get shared queue
        if not q.empty():
            lock.acquire()
            k = q.get()
            time.sleep(1)
            lock.release()
            break
        else:
            time.sleep(1)
    print 'running report N'

### CLOSES SAP ###
def close_SAP():
    print 'closes SAP'

def format_file():
    print 'formatting files'

def multi_daily_pull():
    lock = multiprocessing.Lock()  # creating a lock in multiprocessing
    shared_list = range(6)  # creating a shared list for all functions to use
    q = multiprocessing.Queue()  # creating an empty queue in multiprocessing
    for n in shared_list:  # putting the list into the queue
        q.put(n)
    print 'Starting process at ', time.strftime('%m/%d/%Y %H:%M:%S')
    print 'Starting SAP Pulls at ', time.strftime('%m/%d/%Y %H:%M:%S')
    StartSAP = Process(target=start_SAP)
    StartSAP.start()
    StartSAP.join()
    report1 = Process(target=report_1, args=(q, lock))
    report2 = Process(target=report_2, args=(q, lock))
    report3 = Process(target=report_3, args=(q, lock))
    reportN = Process(target=report_N, args=(q, lock))
    report1.start()
    report2.start()
    report3.start()
    reportN.start()
    report1.join()
    report2.join()
    report3.join()
    reportN.join()
    EndSAP = Process(target=close_SAP)
    EndSAP.start()
    EndSAP.join()
    formatfile = Process(target=format_file)
    formatfile.start()
    formatfile.join()

if __name__ == '__main__':
    multi_daily_pull()
One way to do what you want would be to use the optional timeout argument that the Process.join() method accepts. This makes it block the calling thread for at most that length of time.
I also set the daemon attribute of each Process instance, so your main thread will be able to terminate even if one of the processes it started is still "running" (or has hung).
One final point: you don't need a multiprocessing.Lock to control access to a multiprocessing.Queue, because it handles that aspect automatically, so I removed it. You may still want one for some other reason, such as controlling access to stdout so printing from the various processes doesn't overlap and mess up what is output to the screen.
import multiprocessing
from multiprocessing import Process
import time
import datetime
def start_SAP():
    print 'opening SAP program'

### REPORTS IN SAP ###
def report_1(q):
    while True:  # logic to get shared queue
        if q.empty():
            time.sleep(1)
        else:
            k = q.get()
            time.sleep(1)
            break
    print 'report 1 finished'

def report_2(q):
    while True:  # logic to get shared queue
        if q.empty():
            time.sleep(1)
        else:
            k = q.get()
            time.sleep(1)
            break
    print 'report 2 finished'

def report_3(q):
    while True:  # logic to get shared queue
        if q.empty():
            time.sleep(1)
        else:
            k = q.get()
            time.sleep(60000)  # Take longer than allotted time
            break
    print 'report 3 finished'

def report_N(q):
    while True:  # logic to get shared queue
        if q.empty():
            time.sleep(1)
        else:
            k = q.get()
            time.sleep(1)
            break
    print 'report N finished'

def close_SAP():
    print 'closing SAP'

def format_file():
    print 'formatting files'

def multi_daily_pull():
    shared_list = range(6)  # creating a shared list for all functions to use
    q = multiprocessing.Queue()  # creating an empty queue in multiprocessing
    for n in shared_list:  # putting the list into the queue
        q.put(n)
    print 'Starting process at ', time.strftime('%m/%d/%Y %H:%M:%S')
    print 'Starting SAP Pulls at ', time.strftime('%m/%d/%Y %H:%M:%S')
    StartSAP = Process(target=start_SAP)
    StartSAP.start()
    StartSAP.join()
    report1 = Process(target=report_1, args=(q,))
    report1.daemon = True
    report2 = Process(target=report_2, args=(q,))
    report2.daemon = True
    report3 = Process(target=report_3, args=(q,))
    report3.daemon = True
    reportN = Process(target=report_N, args=(q,))
    reportN.daemon = True
    report1.start()
    report2.start()
    report3.start()
    reportN.start()
    report1.join(30)
    report2.join(30)
    report3.join(30)
    reportN.join(30)
    EndSAP = Process(target=close_SAP)
    EndSAP.start()
    EndSAP.join()
    formatfile = Process(target=format_file)
    formatfile.start()
    formatfile.join()

if __name__ == '__main__':
    multi_daily_pull()
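As a variation on the join(timeout) approach above, if you would rather kill a stalled report outright instead of relying on daemon processes, a rough sketch (reusing the report1 ... reportN processes created in multi_daily_pull above; the 240-second limit is just the 4-minute window mentioned in the question, and note the sequential joins can add up):

    reports = [report1, report2, report3, reportN]
    for r in reports:
        r.join(240)            # wait at most 4 minutes for each report
    for r in reports:
        if r.is_alive():       # still running, so it stalled
            print 'terminating stalled report'
            r.terminate()      # forcefully stop the hung process
            r.join()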
I tried to benchmark the speedup of Pipe over Queue from the multiprocessing package. I thought Pipe would be faster, as Queue uses Pipe internally.
Strangely, Pipe is slower than Queue when sending large numpy arrays. What am I missing here?
Pipe:
import sys
import time
from multiprocessing import Process, Pipe
import numpy as np
NUM = 1000
def worker(conn):
    for task_nbr in range(NUM):
        conn.send(np.random.rand(400, 400, 3))
    sys.exit(1)

def main():
    parent_conn, child_conn = Pipe(duplex=False)
    Process(target=worker, args=(child_conn,)).start()
    for num in range(NUM):
        message = parent_conn.recv()

if __name__ == "__main__":
    start_time = time.time()
    main()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    print "Duration: %s" % duration
    print "Messages Per Second: %s" % msg_per_sec
    # Took 10.86s.
Queue
import sys
import time
from multiprocessing import Process
from multiprocessing import Queue
import numpy as np
NUM = 1000
def worker(q):
    for task_nbr in range(NUM):
        q.put(np.random.rand(400, 400, 3))
    sys.exit(1)

def main():
    recv_q = Queue()
    Process(target=worker, args=(recv_q,)).start()
    for num in range(NUM):
        message = recv_q.get()

if __name__ == "__main__":
    start_time = time.time()
    main()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    print "Duration: %s" % duration
    print "Messages Per Second: %s" % msg_per_sec
    # Took 6.86s.
You can do an experiment and put the following into your Pipe code above:
def worker(conn):
    for task_nbr in range(NUM):
        data = np.random.rand(400, 400, 3)
    sys.exit(1)

def main():
    parent_conn, child_conn = Pipe(duplex=False)
    p = Process(target=worker, args=(child_conn,))
    p.start()
    p.join()
This gives you the time that it takes to create the data for your test. On my system this takes about 2.9 seconds.
Under the hood, the Queue object implements a buffer and a threaded send. The thread is still in the same process, but by using it, the data creation doesn't have to wait for the system I/O to complete. It effectively parallelizes the operations. Try your Pipe code modified with some simple threading implemented (disclaimer: the code here is for testing only and is not production ready):
import sys
import time
import threading
from multiprocessing import Process, Pipe, Lock
import numpy as np
import copy
NUM = 1000
def worker(conn):
    _conn = conn
    _buf = []
    _wlock = Lock()
    _sentinel = object()  # signal that we're done

    def thread_worker():
        while 1:
            if _buf:
                _wlock.acquire()
                obj = _buf.pop(0)
                if obj is _sentinel:
                    return
                _conn.send(obj)
                _wlock.release()

    t = threading.Thread(target=thread_worker)
    t.start()
    for task_nbr in range(NUM):
        data = np.random.rand(400, 400, 3)
        data[0][0][0] = task_nbr  # just for integrity check
        _wlock.acquire()
        _buf.append(data)
        _wlock.release()
    _wlock.acquire()
    _buf.append(_sentinel)
    _wlock.release()
    t.join()
    sys.exit(1)

def main():
    parent_conn, child_conn = Pipe(duplex=False)
    Process(target=worker, args=(child_conn,)).start()
    for num in range(NUM):
        message = parent_conn.recv()
        assert num == message[0][0][0], 'Data was corrupted'

if __name__ == "__main__":
    start_time = time.time()
    main()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    print "Duration: %s" % duration
    print "Messages Per Second: %s" % msg_per_sec
On my machine this takes 3.4 seconds to run which is almost exactly the same as your Queue code above.
From https://docs.python.org/2/library/threading.html
In CPython, due to the Global Interpreter Lock, only one thread can execute Python code at once... however, threading is still an appropriate model if you want to run multiple I/O-bound tasks simultaneously.
The queue and pipe differences are definitely an odd implementation detail until you dig into it a bit.
I assume from your print statement that you are using Python 2. However, the strange behavior cannot be replicated with Python 3, where Pipe is actually faster than Queue.
import sys
import time
from multiprocessing import Process, Pipe, Queue
import numpy as np
NUM = 20000
def worker_pipe(conn):
    for task_nbr in range(NUM):
        conn.send(np.random.rand(40, 40, 3))
    sys.exit(1)

def main_pipe():
    parent_conn, child_conn = Pipe(duplex=False)
    Process(target=worker_pipe, args=(child_conn,)).start()
    for num in range(NUM):
        message = parent_conn.recv()

def pipe_test():
    start_time = time.time()
    main_pipe()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    print("Pipe")
    print("Duration: " + str(duration))
    print("Messages Per Second: " + str(msg_per_sec))

def worker_queue(q):
    for task_nbr in range(NUM):
        q.put(np.random.rand(40, 40, 3))
    sys.exit(1)

def main_queue():
    recv_q = Queue()
    Process(target=worker_queue, args=(recv_q,)).start()
    for num in range(NUM):
        message = recv_q.get()

def queue_test():
    start_time = time.time()
    main_queue()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    print("Queue")
    print("Duration: " + str(duration))
    print("Messages Per Second: " + str(msg_per_sec))

if __name__ == "__main__":
    for i in range(2):
        queue_test()
        pipe_test()
Results in:
Queue
Duration: 3.44321894646
Messages Per Second: 5808.51822408
Pipe
Duration: 2.69065594673
Messages Per Second: 7433.13169575
Queue
Duration: 3.45295906067
Messages Per Second: 5792.13354361
Pipe
Duration: 2.78426194191
Messages Per Second: 7183.23218766
On my system, Pipe(duplex=False) is slower (twice the time, or half the rate) than Pipe(duplex=True). For anyone looking for performance, here is a side-by-side comparison:
from time import time
from multiprocessing import Process, Queue, Pipe
n = 1000
buffer = b'\0' * (1000*1000) # 1 megabyte
def print_elapsed(name, start):
    elapsed = time() - start
    spi = elapsed / n
    ips = n / elapsed
    print(f'{name}: {spi*1000:.3f} ms/item, {ips:.0f} item/sec')

def producer(q):
    start = time()
    for i in range(n):
        q.put(buffer)
    print_elapsed('producer', start)

def consumer(q):
    start = time()
    for i in range(n):
        out = q.get()
    print_elapsed('consumer', start)

class PipeQueue():
    def __init__(self, **kwargs):
        self.out_pipe, self.in_pipe = Pipe(**kwargs)

    def put(self, item):
        self.in_pipe.send_bytes(item)

    def get(self):
        return self.out_pipe.recv_bytes()

    def close(self):
        self.out_pipe.close()
        self.in_pipe.close()

print('duplex=True')
q = PipeQueue(duplex=True)
producer_process = Process(target=producer, args=(q,))
consumer_process = Process(target=consumer, args=(q,))
consumer_process.start()
producer_process.start()
consumer_process.join()
producer_process.join()
q.close()

print('duplex=False')
q = PipeQueue(duplex=False)
producer_process = Process(target=producer, args=(q,))
consumer_process = Process(target=consumer, args=(q,))
consumer_process.start()
producer_process.start()
consumer_process.join()
producer_process.join()
q.close()
Results:
duplex=True
consumer: 0.301 ms/item, 3317 item/sec
producer: 0.298 ms/item, 3358 item/sec
duplex=False
consumer: 0.673 ms/item, 1486 item/sec
producer: 0.669 ms/item, 1494 item/sec
I think this must come down to CPython using os.pipe vs socket.socketpair, but I'm not sure.
I tried to run the following code:
import multiprocessing
import time
def init_queue():
    print("init g_queue start")
    while not g_queue.empty():
        g_queue.get()
    for _index in range(10):
        g_queue.put(_index)
    print("init g_queue end")
    return

def task_io(task_id):
    print("IOTask[%s] start" % task_id)
    print("the size of queue is %s" % g_queue.qsize())
    while not g_queue.empty():
        time.sleep(1)
        try:
            data = g_queue.get(block=True, timeout=1)
            print("IOTask[%s] get data: %s" % (task_id, data))
        except Exception as excep:
            print("IOTask[%s] error: %s" % (task_id, str(excep)))
    print("IOTask[%s] end" % task_id)
    return

g_queue = multiprocessing.Queue()

if __name__ == '__main__':
    print("the size of queue is %s" % g_queue.qsize())
    init_queue()
    print("the size of queue is %s" % g_queue.qsize())
    time_0 = time.time()
    process_list = [multiprocessing.Process(target=task_io, args=(i,)) for i in range(multiprocessing.cpu_count())]
    for p in process_list:
        p.start()
    for p in process_list:
        if p.is_alive():
            p.join()
    print("End:", time.time() - time_0, "\n")
what I got was the following:
the size of queue is 0
init g_queue start
init g_queue end
the size of queue is 10
IOTask[0] start
the size of queue is 0
IOTask[0] end
IOTask[1] start
the size of queue is 0
IOTask[1] end
('End:', 0.6480000019073486, '\n')
What I was expecting was
IOTask[0] start
the size of queue is 10
because after initialization of g_queue, the size of the queue was supposed to be 10, not 0. It seems the queue is not in shared memory: when a subprocess starts, a copy of g_queue is created and its size is 0.
Why isn't multiprocessing.Queue in shared memory? Please advise. Many thanks!
You should pass g_queue as a parameter; then it will work.
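Applied to the code in the question, a minimal sketch would pass the queue explicitly to each process (same task_io logic as before, just taking the queue as an argument):

import multiprocessing
import time

def task_io(task_id, q):
    # same worker as before, but the queue is passed in instead of being a module global
    print("IOTask[%s] start, queue size %s" % (task_id, q.qsize()))
    while not q.empty():
        try:
            data = q.get(block=True, timeout=1)
            print("IOTask[%s] get data: %s" % (task_id, data))
        except Exception as excep:
            print("IOTask[%s] error: %s" % (task_id, str(excep)))
    print("IOTask[%s] end" % task_id)

if __name__ == '__main__':
    g_queue = multiprocessing.Queue()
    for _index in range(10):
        g_queue.put(_index)
    process_list = [multiprocessing.Process(target=task_io, args=(i, g_queue))
                    for i in range(multiprocessing.cpu_count())]
    for p in process_list:
        p.start()
    for p in process_list:
        p.join()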
Here is a demo of using multiprocessing with a queue:
import multiprocessing
import time
def long_time_calculate(n, result_queue):
    time.sleep(1)
    result_queue.put(n)

if __name__ == '__main__':
    pool_size = multiprocessing.cpu_count() * 2
    pool = multiprocessing.Pool(processes=pool_size, maxtasksperchild=4)
    manager = multiprocessing.Manager()
    result_queue = manager.Queue()  # a Manager queue can be shared with Pool workers
    inputs = [(1, result_queue), (2, result_queue), (3, result_queue), (4, result_queue)]
    for input in inputs:
        pool.apply_async(long_time_calculate, input)
    pool.close()
    pool.join()
    print(list(result_queue.get() for _ in inputs))