I'm using the threading module in Python to run some tests on I/O-bound processing.
Basically, I am reading a file line by line and writing it out concurrently.
I put the reading and writing loops in separate threads and use a Queue to pass data between them:
q = Queue()
rt = ReadThread(ds)
wt = WriteThread(outBand)
rt.start()
wt.start()
If I run it as above, it works fine, but the interpreter crashes at the end of execution. (Any ideas why?)
If I add:
rt.join()
wt.join()
at the end, the interpreter simply hangs. Any ideas why?
The code for the ReadThread and WriteThread classes is as follows:
class ReadThread(threading.Thread):
def __init__(self, ds):
threading.Thread.__init__(self)
self.ds = ds #The raster datasource to read from
def run(self):
reader(self.ds)
class WriteThread(threading.Thread):
def __init__(self, ds):
threading.Thread.__init__(self)
self.ds = ds #The raster datasource to write to
def run(self):
writer(self.ds)
def reader(ds):
"""Reads data from raster, starting with a chunk for three lines then removing/adding a row for the remainder"""
data = read_lines(ds)
q.put(data[1, :]) #add to the queue
for i in np.arange(3, ds.RasterYSize):
data = np.delete(data, 0, 0)
data = np.vstack([data, read_lines(ds, int(i), 1)])
q.put(data[1,:]) # put the relevant data on the queue
def writer(ds):
""" Writes data from the queue to a raster file """
i = 0
while True:
arr = q.get()
ds.WriteArray(np.atleast_2d(arr), xoff = 0, yoff = i)
i +=1
Calling q.get() will block indefinitely if your Queue is empty.
You can try get_nowait(), but you have to make sure that by the time you get to the writer function there is already something in the Queue.
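As a rough sketch of what that polling approach might look like (keeping the global q from the question; the queue.Empty handling and the 0.1-second back-off are my own choices, not part of the original code):
import queue      # the `Queue` module in Python 2
import time
import numpy as np

def writer(ds):
    """ Writes data from the queue to a raster file, polling instead of blocking """
    i = 0
    while True:
        try:
            arr = q.get_nowait()     # raises queue.Empty right away if nothing is queued
        except queue.Empty:
            time.sleep(0.1)          # back off briefly, then poll again
            continue
        ds.WriteArray(np.atleast_2d(arr), xoff=0, yoff=i)
        i += 1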
wt.join() waits for the thread to finish, which it never does because of the infinite loop around q.get() in writer. To make it finish, add
q.put(None)
as the last line of reader, and change writer to
def writer(ds):
""" Writes data from the queue to a raster file """
for i, arr in enumerate(iter(q.get, None)):
ds.WriteArray(np.atleast_2d(arr), xoff = 0, yoff = i)
iter(q.get, None) yields values from q until q.get returns the sentinel None. I added enumerate to replace the manual counter and simplify the code a little further.
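The two-argument form of iter() is a general pattern, not specific to queues. A tiny self-contained sketch of how it behaves:
import queue

q = queue.Queue()
for item in ['a', 'b', 'c', None]:
    q.put(item)

# iter(callable, sentinel) keeps calling q.get() until it returns the sentinel
# (None here), so the loop ends cleanly without an explicit break.
for value in iter(q.get, None):
    print(value)    # prints a, b, c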
Related
I want to write good tests to make sure my concurrent data structure works. But the tests are passing even on a class that is obviously not thread-safe.
class NotThreadSafe:
def __init__(self):
self.set1 = set()
self.set2 = set()
def add_to_sets(self, item):
self._add_to_set1(item)
self._add_to_set2(item)
def _add_to_set1(self, item):
self.set1.add(item)
def _add_to_set2(self, item):
self.set2.add(item)
def are_sets_equal_length(self):
return len(self.set1) == len(self.set2)
My tests have a reader thread and a writer thread running concurrently. The writer thread calls add_to_sets and the reader thread calls are_sets_equal_length.
But the reader thread always observes are_sets_equal_length to be True, even though the writer thread should theoretically cause inequalities.
How can I add a time delay to _add_to_set2 so that it forces the race condition to surface?
The test:
import threading
import time
def writer_fn(nts: NotThreadSafe):
for i in range(1000):
nts.add_to_sets(i)
def reader_fn(nts: NotThreadSafe, stop: list, results: list):
while not len(stop):
if not nts.are_sets_equal_length():
results.append(False)
return
results.append(True)
def test_nts():
nts = NotThreadSafe()
stop = []
results = []
reader = threading.Thread(target=reader_fn, args=[nts, stop, results])
writer = threading.Thread(target=writer_fn, args=[nts])
reader.start()
writer.start()
writer.join()
stop.append(True)
reader.join()
assert not results[0]
Step 1: Write a wrapper that creates a new function containing a time delay.
def slow_wrapper(method):
"""Adds a tiny delay to a method. Good for triggering race conditions that would otherwise be very rare."""
def wrapped_method(*args):
time.sleep(0.001)
return method(*args)
return wrapped_method
Step 2: In the test function, after creating the object, replace _add_to_set2 with the slowed-down version:
nts = NotThreadSafe()
# change _add_to_set2 into a time-delayed version
nts._add_to_set2 = slow_wrapper(nts._add_to_set2)
Step 3: Run the test. The delay should now reliably expose the race condition, so the reader observes unequal set lengths and the assert not results[0] check succeeds.
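Putting the pieces together, the patched test could look like this (identical to the original test except for the monkey-patching line):
def test_nts():
    nts = NotThreadSafe()
    # slow down the second set update so the reader can catch the window
    # where set1 has been updated but set2 has not
    nts._add_to_set2 = slow_wrapper(nts._add_to_set2)
    stop = []
    results = []
    reader = threading.Thread(target=reader_fn, args=[nts, stop, results])
    writer = threading.Thread(target=writer_fn, args=[nts])
    reader.start()
    writer.start()
    writer.join()
    stop.append(True)
    reader.join()
    assert not results[0]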
I'm looking for faster ways to prepare my dataset for a machine-learning task. I found that the multiprocessing library might be helpful. However, because I'm a newbie with multiprocessing, I couldn't find a proper way to use it.
I first wrote some code like the below:
class DatasetReader:
def __init__(self):
self.data_list = Read_Data_from_file
self.data = []
def _ready_data(self, ex, idx):
# Some complex functions that take several minutes
def _dataset_creator(self, queue):
for idx, ex in enumerate(self.data_list):
queue.put(self._ready_data(ex, idx))
def _dataset_consumer(self, queue):
total_mem = 0.0
t = tqdm(range(self.num_data), total=self.num_data, desc='Building Dataset ', bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) [{elapsed}<{remaining},{rate_fmt}{postfix}]')
for idx in t:
ins = queue.get()
self.data.append(ins)
gc.collect()
def _build_dataset(self):
queue = Queue()
creator = Process(target=self._dataset_creator, args=(queue,))
consumer = Process(target=self._dataset_consumer, args=(queue,))
creator.start()
consumer.start()
queue.close()
queue.join_thread()
creator.join()
consumer.join()
However, in my opinion, because _dataset_creator processes the data (via _ready_data) in a serial manner, this does not help reduce the time consumption.
So, I modified the code to generate multiple processes that process one datum:
class DatasetReader:
def __init__(self):
self.data_list = Read_Data_from_file
self.data = []
def _ready_data(self, ex, idx):
# Some complex functions that take several minutes
def _dataset_creator(self, ex, idx, queue):
queue.put(self._ready_data(ex, idx))
def _dataset_consumer(self, queue):
total_mem = 0.0
t = tqdm(range(self.num_data), total=self.num_data, desc='Building Dataset ', bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) [{elapsed}<{remaining},{rate_fmt}{postfix}]')
for idx in t:
ins = queue.get()
self.data.append(ins)
gc.collect()
def _build_dataset(self):
queue = Queue()
for idx, ex in enumerate(self.data_list):
p = Process(target=self._dataset_creator, args=(ex, idx, queue,))
p.start()
consumer = Process(target=self._dataset_consumer, args=(queue,))
consumer.start()
queue.close()
queue.join_thread()
consumer.join()
However, this gives me errors:
Process Process-18:
Traceback ~~~
RuntimeError: can't start new thread
Traceback ~~~
OSError: [Errno 12] Cannot allocate memory
Could you help me process this complex data in parallel?
EDIT 1:
Thanks to @tdelaney, I was able to reduce the time consumption by spawning self.num_worker processes (16 in my experiment):
def _dataset_creator(self, pid, queue):
for idx, ex in list(enumerate(self.data_list))[pid::self.num_worker]:
queue.put(self._ready_data(ex, idx))
def _dataset_consumer(self, queue):
t = tqdm(range(self.num_data), total=self.num_data, desc='Building Dataset ', bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) [{elapsed}<{remaining},{rate_fmt}{postfix}]')
for _ in t:
ins = queue.get()
self.data[ins['idx']] = ins
def _build_dataset(self):
queue = Queue()
procs = []
for pid in range(self.num_worker):
p = Process(target=self._dataset_creator, args=(pid, queue,))
procs.append(p)
p.start()
consumer = Process(target=self._dataset_consumer, args=(queue,))
consumer.start()
queue.close()
queue.join_thread()
for p in procs:
p.join()
consumer.join()
I'm trying to sketch out what a solution with a multiprocessing pool would look like. I got rid of the consumer process completely because it looks like the parent process is just waiting anyway (and needs the data eventually), so it can be the consumer. So, I set up a pool and use imap_unordered to hand the data to the workers.
I guessed that the data processing doesn't really need the DatasetReader at all and moved it out to its own function. On Windows, either the entire DatasetReader object is serialized to the subprocess (including data you don't want) or the child's version of the object is incomplete and may crash when you try to use it.
Either way, changes made to a DatasetReader object in the child processes aren't seen in the parent. This can be unexpected if the parent depends on updated state in that object. It's best to strictly limit what happens in the subprocesses, in my opinion.
from multiprocessing import Pool, get_start_method, cpu_count
from tqdm import tqdm

# moved out of the class (assuming it is not class dependent) so that
# the entire DatasetReader object isn't pickled and sent to
# the child on spawning systems like Microsoft Windows
def _ready_data(idx_ex):
    idx, ex = idx_ex
    # Some complex functions that take several minutes
    result = complex_functions(ex)
    return (idx, result)

class DatasetReader:
    def __init__(self):
        self.data_list = Read_Data_from_file
        self.num_data = len(self.data_list)
        self.data = [None] * self.num_data

    def _ready_data_fork(self, idx):
        # on a forking system the child already has self.data_list,
        # so the parent only sends the index
        return _ready_data((idx, self.data_list[idx]))

    def run(self):
        t = tqdm(total=self.num_data, desc='Building Dataset ',
                 bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) '
                            '[{elapsed}<{remaining},{rate_fmt}{postfix}]')
        pool = Pool(min(cpu_count(), len(self.data_list)))
        if get_start_method() == 'fork':
            # on a forking system, self.data_list already exists in the
            # child process and we only pass the index
            result_iter = pool.imap_unordered(self._ready_data_fork,
                                              range(len(self.data_list)),
                                              chunksize=1)
        else:
            # on a spawning system, we need to pass the data itself
            result_iter = pool.imap_unordered(_ready_data,
                                              enumerate(self.data_list),
                                              chunksize=1)
        for idx, result in result_iter:
            t.update()
            self.data[idx] = result
        pool.close()
        pool.join()
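One usage note: on spawn-based platforms (Windows, and macOS by default since Python 3.8), the pool has to be created from code guarded by the main-module check, otherwise each child re-imports the module and tries to build its own pool. A minimal sketch of how this class might be driven, with DatasetReader and its placeholders assumed from above:

if __name__ == '__main__':
    reader = DatasetReader()
    reader.run()
    # reader.data now holds one prepared item per input datum
    print('prepared', sum(item is not None for item in reader.data), 'items')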
I'm using Python and OpenCV to get video from an RTSP stream. I'm grabbing single frames from the stream and saving them to the file system.
I wrote a StreamingWorker which handles frame grabbing and saving. Additionally, there is a StreamPool that holds all the streaming objects. I thought that since a StreamingWorker would always be running, there should only be one per core in order to use as much processing power as possible. The StreamPool would then provide the VideoCapture objects to the available StreamingWorker.
The problem is that for most of the time the script is running, it is blocked:
import os
import time
import threading
import cv2 as cv
class StreamingWorker(object):
def __init__(self, stream_pool):
self.stream_pool = stream_pool
self.start_loop()
def start_loop(self):
while True:
try:
# getting a stream from the read_strategy
stream_object = self.stream_pool.next()
# getting an image from the stream
_, frame = stream_object['stream'].read()
# saving image to file system
cv.imwrite(os.path.join('result', stream_object['feed'], '{}.jpg'.format(time.time())), frame)
except ValueError as e:
print('[error] {}'.format(e))
class StreamPool(object):
def __init__(self, streams):
self.streams = [{'feed': stream, 'stream': cv.VideoCapture(stream)} for stream in streams]
self.current_stream = 0
self.lock = threading.RLock()
def next(self):
self.lock.acquire()
if(self.current_stream + 1 >= len(self.streams)):
self.current_stream = 0
else:
self.current_stream += 1
result = self.streams[self.current_stream]
self.lock.release()
return result
def get_cores():
# This function returns the number of available cores
import multiprocessing
return multiprocessing.cpu_count()
def start(stream_pool):
StreamingWorker(stream_pool)
def divide_list(input_list, amount):
# This function divides the whole list into list of lists
result = [[] for _ in range(amount)]
for i in range(len(input_list)):
result[i % len(result)].append(input_list[i])
return result
if __name__ == '__main__':
stream_list = ['rtsp://some/stream1', 'rtsp://some/stream2', 'rtsp://some/stream3']
num_cores = get_cores()
divided_streams = divide_list(stream_list, num_cores)
for streams in divided_streams:
stream_pool = StreamPool(streams)
thread = threading.Thread(target=start, args=(stream_pool,))
thread.start()
When I thought of this, I didn't take into account that most of the operations will be blocking operations like:
# Getting a frame blocks
_, frame = stream_object['stream'].read()
# Writing to the file system blocks
cv.imwrite(os.path.join('result', stream_object['feed'], '{}.jpg'.format(time.time())), frame)
The problem with spending so much time blocking is that most of the processing power is wasted. I thought of using futures with a ThreadPoolExecutor, but I can't seem to reach my goal of using the maximum number of processing cores possible. Maybe I'm not setting enough threads.
Is there a standard way of handling blocking operations, in order to make the best use of the cores' processing power? I'm fine having a language-agnostic answer.
I ended up using the ThreadPoolExecutor using the add_done_callback(fn) function.
class StreamingWorker(object):
def __init__(self, stream_pool):
self.stream_pool = stream_pool
self.thread_pool = ThreadPoolExecutor(10)
self.start_loop()
def start_loop(self):
def done(fn):
print('[info] future done')
def save_image(stream):
# getting an image from the stream
_, frame = stream['stream'].read()
# saving image to file system
cv.imwrite(os.path.join('result', stream['feed'], '{}.jpg'.format(time.time())), frame)
while True:
try:
# getting a stream from the read_strategy
stream_object = self.stream_pool.next()
# Scheduling the process to the thread pool
self.thread_pool.submit(save_image, (stream_object)).add_done_callback(done)
except ValueError as e:
print('[error] {}'.format(e))
I didn't actually want to do anything after the future finished, but if I called result() then the while True loop would block waiting for each frame, which would defeat the whole purpose of using the thread pool.
Side note: I had to add a threading.RLock() around the call to self.stream_pool.next() because apparently OpenCV can't handle calls from multiple threads.
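One thing worth noting (my own addition, not part of the original answer): with submit() and a callback that only prints, any exception raised inside save_image is silently swallowed. A small sketch of a done callback that surfaces errors without blocking the submitting loop:

def done(fn):
    # the future is already finished inside a done callback,
    # so exception() returns immediately instead of blocking
    exc = fn.exception()
    if exc is not None:
        print('[error] frame task failed: {}'.format(exc))
    else:
        print('[info] future done')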
I'm trying to share an existing object across multiple processes using the proxy methods described here. My multiprocessing idiom is the worker/queue setup, modeled after the 4th example here.
The code needs to do some calculations on data that are stored in rather large files on disk. I have a class that encapsulates all the I/O interactions, and once it has read a file from disk, it saves the data in memory for the next time a task needs to use the same data (which happens often).
I thought I had everything working from reading the examples linked to above. Here is a mock up of the code that just uses numpy random arrays to model the disk I/O:
import numpy
from multiprocessing import Process, Queue, current_process, Lock
from multiprocessing.managers import BaseManager
nfiles = 200
njobs = 1000
class BigFiles:
def __init__(self, nfiles):
# Start out with nothing read in.
self.data = [ None for i in range(nfiles) ]
# Use a lock to make sure only one process is reading from disk at a time.
self.lock = Lock()
def access(self, i):
# Get the data for a particular file
# In my real application, this function reads in files from disk.
# Here I mock it up with random numpy arrays.
if self.data[i] is None:
with self.lock:
self.data[i] = numpy.random.rand(1024,1024)
return self.data[i]
def summary(self):
return 'BigFiles: %d, %d Storing %d of %d files in memory'%(
id(self),id(self.data),
(len(self.data) - self.data.count(None)),
len(self.data) )
# I'm using a worker/queue setup for the multiprocessing:
def worker(input, output):
proc = current_process().name
for job in iter(input.get, 'STOP'):
(big_files, i, ifile) = job
data = big_files.access(ifile)
# Do some calculations on the data
answer = numpy.var(data)
msg = '%s, job %d'%(proc, i)
msg += '\n Answer for file %d = %f'%(ifile, answer)
msg += '\n ' + big_files.summary()
output.put(msg)
# A class that returns an existing file when called.
# This is my attempted workaround for the fact that Manager.register needs a callable.
class ObjectGetter:
def __init__(self, obj):
self.obj = obj
def __call__(self):
return self.obj
def main():
# Prior to the place where I want to do the multiprocessing,
# I already have a BigFiles object, which might have some data already read in.
# (Here I start it out empty.)
big_files = BigFiles(nfiles)
print 'Initial big_files.summary = ',big_files.summary()
# My attempt at making a proxy class to pass big_files to the workers
class BigFileManager(BaseManager):
pass
getter = ObjectGetter(big_files)
BigFileManager.register('big_files', callable = getter)
manager = BigFileManager()
manager.start()
# Set up the jobs:
task_queue = Queue()
for i in range(njobs):
ifile = numpy.random.randint(0, nfiles)
big_files_proxy = manager.big_files()
task_queue.put( (big_files_proxy, i, ifile) )
# Set up the workers
nproc = 12
done_queue = Queue()
process_list = []
for j in range(nproc):
p = Process(target=worker, args=(task_queue, done_queue))
p.start()
process_list.append(p)
task_queue.put('STOP')
# Log the results
for i in range(njobs):
msg = done_queue.get()
print msg
print 'Finished all jobs'
print 'big_files.summary = ',big_files.summary()
# Shut down the workers
for j in range(nproc):
process_list[j].join()
task_queue.close()
done_queue.close()
main()
This works in the sense that it calculates everything correctly, and it is caching the data that is read along the way. The only problem I'm having is that at the end, the big_files object doesn't have any of the files loaded. The final msg returned is:
Process-2, job 999. Answer for file 198 = 0.083406
BigFiles: 4303246400, 4314056248 Storing 198 of 200 files in memory
But then after it's all done, we have:
Finished all jobs
big_files.summary = BigFiles: 4303246400, 4314056248 Storing 0 of 200 files in memory
So my question is: What happened to all the stored data? It's claiming to be using the same self.data according to the id(self.data). But it's empty now.
I want the end state of big_files to have all the saved data that it accumulated along the way, since I actually have to repeat this entire process many times, so I don't want to have to redo all the (slow) I/O each time.
I'm assuming it must have something to do with my ObjectGetter class. The examples for using BaseManager only show how to make a new object that will be shared, not how to share an existing one. So am I doing something wrong with the way I get the existing big_files object? Can anyone suggest a better way to do this step?
Thanks much!
So I run the code below, and when I use queue.qsize() after I run it, there are still 450,000 or so items in the queue, implying most lines of the text file were not read. Any idea what is going on here?
from Queue import Queue
from threading import Thread
lines = 660918 #int(str.split(os.popen('wc -l HGDP_FinalReport_Forward.txt').read())[0]) -1
queue = Queue()
File = 'HGDP_FinalReport_Forward.txt'
num_threads =10
short_file = open(File)
class worker(Thread):
def __init__(self,queue):
Thread.__init__(self)
self.queue = queue
def run(self):
while True:
try:
self.queue.get()
i = short_file.readline()
self.queue.task_done() #signal to the queue that the task is done
except:
break
## This is where I should make the call to the threads
def main():
for i in range(num_threads):
worker(queue).start()
queue.join()
for i in range(lines): # put the range of the number of lines in the .txt file
queue.put(i)
main()
It's hard to know exactly what you're trying to do here, but if each line can be processed independently, multiprocessing is a much simpler choice that will take care of all the synchronization for you. An added bonus is that you don't have to know the number of lines in advance.
Basically,
import multiprocessing

def process(line):
    return len(line) # or whatever

# create the pool only after process() is defined, so the forked
# workers can find the function when tasks are unpickled
pool = multiprocessing.Pool(10)

with open(path) as lines:
    results = pool.map(process, lines)
Or, if you're just trying to get some kind of aggregate result from the lines, you can use reduce to lower memory usage.
import operator
with open(path) as lines:
result = reduce(operator.add, pool.map(process, lines))
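If memory is really the concern, a variant worth considering (my own sketch, not part of the original answer) is pool.imap, which hands results back lazily instead of building the full list that pool.map returns:

import multiprocessing
import operator
from functools import reduce   # reduce is a builtin in Python 2

def process(line):
    return len(line) # or whatever per-line work is needed

if __name__ == '__main__':
    pool = multiprocessing.Pool(10)
    with open(path) as lines:   # `path` is assumed from the snippet above
        # imap streams results back a chunk at a time, so the full result
        # list never has to exist in memory at once
        result = reduce(operator.add, pool.imap(process, lines, chunksize=100))
    pool.close()
    pool.join()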
So I tried doing this, but I'm getting a bit confused because I need to pass a single line each time, and that isn't what the code seems to be doing:
import multiprocessing as mp
File = 'HGDP_FinalReport_Forward.txt'
#short_file = open(File)
test = []
def pro(temp_line):
temp_line = temp_line.strip().split()
return len(temp_line)
if __name__ == "__main__":
with open("HGDP_FinalReport_Forward.txt") as lines:
pool = mp.Pool(processes = 10)
t = pool.map(pro,lines)