Python Threading: Multiline Progress Report - python

I'm writing this program that downloads multiple files simultaneously using threads and reports the progress for each file on the screen.
I'm using one different thread to download each file. That is no problem:
for file_url, filename in zip(file_urls, filenames):
th = Thread(target=download_file, args=(file_url, filename))
threads.append(th)
th.start()
for thread in threads:
thread.join()
The problem is I don't know anyways that I can print each file progress report on the screen that will look like this:
"Downloading 'example1.zip': 54366 bytes of 2240799 70.7%"
"Downloading 'example2.zip': 31712 bytes of 1924639 64.7%"
"Downloading 'example3.zip': 21712 bytes of 3224979 34.7%"
The following snippet is for a single line progress report:
def chunk_report(bytes_so_far, total_size, filename):
percent = float(bytes_so_far) / total_size
percent = round(percent * 100, 2)
print "Downloading '{0}': {1} of {2} {3:3.2g}% \r".format(filename,
bytes_so_far, total_size, percent),
and the output would look like this:
"Downloading 'example2.zip': 31712 bytes of 1924639 64.7%"
each time a thread call this function it updates the screen for the file that, the thread is downloading.
So, the question is how do I print multiline progress report like the one that I illustrated above in python?
Thanks in advance.

I would use Queue to report progress to a reporting thread:
Create a Queue
Spawn each downloading thread passing the Queue as an argument
Have each downloading put progress messages to the Queue
Spawn a reporting thread which reads progress messages from the Queue and updates the display
A simulated example:
import threading
import time
import random
import Queue
import sys
# a downloading thread
def worker(path, total, q):
size = 0
while size < total:
dt = random.randint(1,3)
time.sleep(dt)
ds = random.randint(1,5)
size = size + ds
if size > total: size = total
q.put(("update", path, total, size))
q.put(("done", path))
# the reporting thread
def reporter(q, nworkers):
status = {}
while nworkers > 0:
msg = q.get()
if msg[0] == "update":
path, total, size = msg[1:]
status[path] = (total, size)
# update the screen here
show_progress(status)
elif msg[0] == "done":
nworkers = nworkers - 1
print ""
def show_progress(status):
line = ""
for path in status:
(total, size) = status[path]
line = line + "%s: %3d/%d " % (path, size,total)
sys.stdout.write("\r"+line)
sys.stdout.flush()
def main():
q = Queue.Queue()
w1 = threading.Thread(target = worker, args = ("abc", 30, q) )
w2 = threading.Thread(target = worker, args = ("foobar", 25, q))
w3 = threading.Thread(target = worker, args = ("bazquux", 16, q))
r = threading.Thread(target = reporter, args = (q, 3))
for t in [w1,w2,w3,r]: t.start()
for t in [w1,w2,w3,r]: t.join()
main()

Related

Multiprocessing not spawning all the requested processes

I'm working with Orcaflex (a FEM software for offshore analysis, but should not be relevant). I created a script to check if the simulations I've performed have been completed successfully (The simulation can fail for not reaching convergence). Since I'm talking about thousands of files I was trying to parallelize the process with multiprocessing. Following, my code. Sorry but I can't produce a working example for you, but I'll try to explain in detail. I created a derived Class of multiprocessing.Process and overwrite the run() to perform the checks on the simulations files.
Then, in __main__ I set a number of processors, split the files accordingly, and start the execution.
The problem is that the processes are not spawning altogether but, in what appear to be, a random amount of time from one to another. Is this what it is supposed to be or am I missing something?
What I mean by not spawning altogether is that I see:
[Info/Worker-1] child process calling self.run()
and for example:
[Info/Worker-4] child process calling self.run()
after about 10 min of the program running.
Thanks in advance for any help/suggetsion.
import os
import subprocess
import glob
import multiprocessing
import logging
import sys
import OrcFxAPI as of
class Worker(multiprocessing.Process):
myJobs = []
def setJobs(self, jobList):
self.myJobs = jobList
#staticmethod
def changedExtensionFileName(oldFileName, newExtension):
return '.'.join((os.path.splitext(oldFileName)[0], newExtension))
def run(self):
failed = []
model = of.Model(threadCount=1)
for job in self.myJobs:
try:
print('%s starting' % job)
sys.stdout.flush()
model.LoadSimulation(job)
if model.state == of.ModelState.SimulationStoppedUnstable:
newJob = job.replace('.sim', '.dat')
failed.append(newJob)
with open('Failed_Sim.txt', 'a') as f:
f.write(f'{newJob}\n')
f.close()
model.LoadData(newJob)
model.general.ImplicitConstantTimeStep /= 2
model.SaveData(newJob)
print(f'{job} has failed, reducing time step')
except of.DLLError as err:
print('%s ERROR: %s' % (job, err))
sys.stdout.flush()
with open(self.changedExtensionFileName(job, 'FAIL'), 'w') as f:
f.write('%s error: %s' % (job, err))
f.close()
return
if __name__ == '__main__':
import re
sim_file = [f for f in os.listdir() if re.search(r'\d\d\d\d.*.sim', f)]
# begin multprocessing
multiprocessing.log_to_stderr()
logger = multiprocessing.get_logger()
logger.setLevel(logging.INFO)
corecount = 14
workers = []
chunkSize = int(len(sim_file) / corecount)
chunkRemainder = int(len(sim_file) % corecount)
print('%s jobs found, dividing across %s workers - %s each remainder %s' % (str(len(sim_file)), str(corecount), chunkSize, chunkRemainder))
start = 0
for coreNum in range(0, corecount):
worker = Worker()
workers.append(worker)
end = start + chunkSize
if chunkRemainder>0:
chunkRemainder -= 1
end += 1
if end>len(sim_file):
end = len(sim_file)
worker.setJobs(sim_file[start:end])
worker.start()
start = end
if start>=len(sim_file):
break
for worker in workers:
worker.join()
print('Done...')
OK, so no one put up their hand to answer this by minor tweak (which I don't know how to do!), so here comes the larger rejig proposal...
def worker(inpData):
#The worker process
failed1 = []
failed2 = []
for job in inpData: #I'm not sure of the data shape of the chunks, has your original method split them into coherent chunks capable of being processed independently? My step here could be wrong.
try:
#print('%s starting' % job) #Prints won't appear on console from worker processes from windows, so commented them all out
model.LoadSimulation(job)
if model.state == of.ModelState.SimulationStoppedUnstable:
newJob = job.replace('.sim', '.dat')
failed1.append(newJob)
#I'd recommend we pass the list "failed" back to master and write to text from there, otherwise you could have several processes updating the text file at once, leading to possible loss of data
#with open('Failed_Sim.txt', 'a') as f:
# f.write(f'{newJob}\n')
# f.close()
model.LoadData(newJob)
model.general.ImplicitConstantTimeStep /= 2
model.SaveData(newJob)
#print(f'{job} has failed, reducing time step')
except of.DLLError as err:
#print('%s ERROR: %s' % (job, err))
#sys.stdout.flush()
#with open(self.changedExtensionFileName(job, 'FAIL'), 'w') as f:
# f.write('%s error: %s' % (job, err))
# f.close()
failed2.append(job)
#Note I've made two failed lists to pass back, for both failure types
return failed1, failed2
if __name__ == "__main__":
import re
import multiprocessing as mp
nCPUs = mp.cpu_count()
sim_file = [f for f in os.listdir() if re.search(r'\d\d\d\d.*.sim', f)]
#Make the chunks
chunkSize = int(len(sim_file) / corecount)
chunkRemainder = int(len(sim_file) % corecount)
print('%s jobs found, dividing across %s workers - %s each remainder %s' % (str(len(sim_file)), str(corecount), chunkSize, chunkRemainder))
chunks = []
start = 0
for iChunk in range(0, nCPUs)
end = start + chunkSize
if chunkRemainder>0:
chunkRemainder -= 1
end += 1
if end>len(sim_file):
end = len(sim_file)
chunk.append(sim_file[start:end])
#Send to workers
pool = mp.Pool(processes=nCPUs)
futA = []
for iChunk in range(0, nCPUs):
futA.append(pool.apply_async(worker, args=(chunk[iChunk],))
#Gather results
if futA:
failedDat = []
failedSim = []
for iChunk in range(0, len(futA)):
resA, resB = futA[iChunk].get()
failedDat.extend(resA)
failedSim.extend(resB)
pool.close()
if failedDat:
print("Following jobs failed, reducing timesteps:")
print(failedDat)
if failedSim:
print("Following sims failed due to errors")
print(failedSim)

Process pool results without waiting for all tasks to finish

from multiprocessing import Pool
from functools import partial
from time import sleep
import random
import string
import uuid
import os
import glob
def task_a(param1, param2, mydata):
thread_id = str(uuid.uuid4().hex) # this may not be robust enough to guarantee no collisions, address
output_filename = ''.join([str(thread_id),'.txt'])
# part 1 - create output file for task_b to use
with open(output_filename, 'w') as outfile:
for line in mydata:
outfile.write(line)
# part 2 - do some extra stuff (whilst task_b is running)
sleep(5)
print('Task A finished')
return output_filename # not interested in return val
def task_b(expected_num_files):
processed_files = 0
while processed_files<expected_num_files:
print('I am task_b, waiting for {} files ({} so far)'.format(expected_num_files, processed_files))
path_to_search = ''
for filename in glob.iglob(path_to_search + '*.txt', recursive=True):
print('Got file : {}'.format(filename))
# would do something complicated here
os.rename(filename, filename+'.done')
processed_files+=1
sleep(10)
if __name__ == '__main__':
param1 = '' # dummy variable, need to support in solution
param2 = '' # dummy variable, need to support in solution
num_workers = 2
full_data = [[random.choice(string.ascii_lowercase) for _ in range(5)] for _ in range(100)]
print(full_data)
for i in range(0, len(full_data), num_workers):
print('Going to process {}'.format(full_data[i:i+num_workers]))
p = Pool(num_workers)
task_a_func = partial(task_a, param1, param2)
results = p.map(task_a_func, full_data[i:i+num_workers])
p.close()
p.join()
task_b(expected_num_files=num_workers) # want this running sooner
print('Iteration {} complete'.format(i))
#want to wait for task_a's and task_b to finish
I'm having trouble scheduling these tasks to run concurrently.
task_a is a multiprocessing pool that produces an output file part way through it execution.
task_b MUST process the output files sequentially can be in any order (can be as soon as they are available), WHILST task_a continues to run (it will no longer change the output file)
The next iteration must only start when both all task_a's have completed AND task_b has completed.
The toy code I have posted obviously waits for task_a's to fully complete before task_b is started (which is not what I want)
I have looked at multiprocessing / subprocess etc. but cannot find a way to launch both the pool and the single task_b process concurrently AND wait for BOTH to finish.
task_b is written as if it could be changed to an external script, but I am still stuck on how manage the execution.
Should I effectively merge code from task_b into task_a and somehow pass a flag to ensure one worker per pool 'runs the task_b code' via a if/else - at least then I would just be waiting on the pool to complete?
You can use an interprocess queue to communicate the filenames between task a and task b.
Also, initializing pool repeatedly inside the loop is harmful and unnecessarily slow.
Its better to initialize the pool once in the beginning.
from multiprocessing import Pool, Manager, Event
from functools import partial
from time import sleep
import random
import string
import uuid
import os
import glob
def task_a(param1, param2, queue, mydata):
thread_id = str(uuid.uuid4().hex)
output_filename = ''.join([str(thread_id),'.txt'])
output_filename = 'data/' + output_filename
with open(output_filename, 'w') as outfile:
for line in mydata:
outfile.write(line)
print(f'{thread_id}: Task A file write complete for data {mydata}')
queue.put(output_filename)
print('Task A finished')
def task_b(queue, num_workers, data_size, event_task_b_done):
print('Task b started!')
processed_files = 0
while True:
filename = queue.get()
if filename == 'QUIT':
# Whenever you want task_b to quit, just push 'quit' to the queue
print('Task b quitting')
break
print('Got file : {}'.format(filename))
os.rename(filename, filename+'.done')
processed_files+=1
print(f'Have processed {processed_files} so far!')
if (processed_files % num_workers == 0) or (processed_files == data_size):
event_task_b_done.set()
if __name__ == '__main__':
param1 = '' # dummy variable, need to support in solution
param2 = '' # dummy variable, need to support in solution
num_workers = 2
data_size = 100
full_data = [[random.choice(string.ascii_lowercase) for _ in range(5)] for _ in range(data_size)]
mgr = Manager()
queue = mgr.Queue()
event_task_b_done = mgr.Event()
# One extra worker for task b
p = Pool(num_workers + 1)
p.apply_async(task_b, args=(queue, num_workers, data_size, event_task_b_done))
task_a_func = partial(task_a, param1, param2, queue)
for i in range(0, len(full_data), num_workers):
data = full_data[i:i+num_workers]
print('Going to process {}'.format(data))
p.map_async(task_a_func, full_data[i:i+num_workers])
print(f'Waiting for task b to process all {num_workers} files...')
event_task_b_done.wait()
event_task_b_done.clear()
print('Iteration {} complete'.format(i))
queue.put('QUIT')
p.close()
p.join()
exit(0)

how to use threads to grab multiple chunks of a file concurrently from server but write to disk atomically?

I am stuck in a grieve problem, and not able to figure out which way to go, in attempts made whole day I have posted so many times, this is not a duplicate question, since I need clarity how can I use multiple threads to grab a multiple chunks simultaneously from a server but write to disk atomically by locking the file write operation for single thread access; while the next thread waits for lock to get released.
import argparse,logging, Queue, os, requests, signal, sys, time, threading
import utils as _fdUtils
DESKTOP_PATH = os.path.expanduser("~/Desktop")
appName = 'FileDownloader'
logFile = os.path.join(DESKTOP_PATH, '%s.log' % appName)
_log = _fdUtils.fdLogger(appName, logFile, logging.DEBUG, logging.DEBUG, console_level=logging.DEBUG)
queue = Queue.Queue()
STOP_REQUEST = threading.Event()
maxSplits = threading.BoundedSemaphore(3)
threadLimiter = threading.BoundedSemaphore(5)
lock = threading.Lock()
Update:1
def _grabAndWriteToDisk(url, saveTo, first=None, queue=None, mode='wb', irange=None):
""" Function to download file..
Args:
url(str): url of file to download
saveTo(str): path where to save file
first(int): starting byte of the range
queue(Queue.Queue): queue object to set status for file download
mode(str): mode of file to be downloaded
irange(str): range of byte to download
"""
fileName = url.split('/')[-1]
filePath = os.path.join(saveTo, fileName)
fileSize = int(_fdUtils.getUrlSizeInBytes(url))
downloadedFileSize = 0 if not first else first
block_sz = 8192
irange = irange if irange else '0-%s' % fileSize
# print mode
resp = requests.get(url, headers={'Range': 'bytes=%s' % irange}, stream=True)
fileBuffer = resp.raw.read()
with open(filePath, mode) as fd:
downloadedFileSize += len(fileBuffer)
fd.write(fileBuffer)
status = r"%10d [%3.2f%%]" % (downloadedFileSize, downloadedFileSize * 100. / fileSize)
status = status + chr(8)*(len(status)+1)
sys.stdout.write('%s\r' % status)
time.sleep(.05)
sys.stdout.flush()
if downloadedFileSize == fileSize:
STOP_REQUEST.set()
if queue:
queue.task_done()
_log.info("Download Completed %s%% for file %s, saved to %s",
downloadedFileSize * 100. / fileSize, fileName, saveTo)
class ThreadedFetch(threading.Thread):
""" docstring for ThreadedFetch
"""
def __init__(self, queue):
super(ThreadedFetch, self).__init__()
self.queue = queue
self.lock = threading.Lock()
def run(self):
threadLimiter.acquire()
try:
items = self.queue.get()
url = items[0]
saveTo = DESKTOP_PATH if not items[1] else items[1]
split = items[-1]
# grab split chunks in separate thread.
if split > 1:
maxSplits.acquire()
try:
sizeInBytes = int(_fdUtils.getUrlSizeInBytes(url))
byteRanges = _fdUtils.getRange(sizeInBytes, split)
mode = 'wb'
th = threading.Thread(target=_grabAndWriteToDisk, args=(url, saveTo, first, self.queue, mode, _range))
_log.info("Pulling for range %s using %s" , _range, th.getName())
th.start()
# _grabAndWriteToDisk(url, saveTo, first, self.queue, mode, _range)
mode = 'a'
finally:
maxSplits.release()
else:
while not STOP_REQUEST.isSet():
self.setName("primary_%s" % url.split('/')[-1])
# if downlaod whole file in single chunk no need
# to start a new thread, so directly download here.
_grabAndWriteToDisk(url, saveTo, 0, self.queue)
finally:
threadLimiter.release()
def main(appName, flag='with'):
args = _fdUtils.getParser()
urls_saveTo = {}
if flag == 'with':
_fdUtils.Watcher()
elif flag != 'without':
_log.info('unrecognized flag: %s', flag)
sys.exit()
# spawn a pool of threads, and pass them queue instance
# each url will be downloaded concurrently
for i in xrange(len(args.urls)):
t = ThreadedFetch(queue)
t.daemon = True
t.start()
split = 4
try:
for url in args.urls:
# TODO: put split as value of url as tuple with saveTo
urls_saveTo[url] = args.saveTo
# populate queue with data
for url, saveTo in urls_saveTo.iteritems():
queue.put((url, saveTo, split))
# wait on the queue until everything has been processed
queue.join()
_log.info('Finsihed all dowonloads.')
except (KeyboardInterrupt, SystemExit):
_log.critical('! Received keyboard interrupt, quitting threads.')
I expect to multiple threads grab each chunk but in the terminal i see same thread grabbing each range of chunk:
INFO - Pulling for range 0-25583 using Thread-1
INFO - Pulling for range 25584-51166 using Thread-1
INFO - Pulling for range 51167-76748 using Thread-1
INFO - Pulling for range 76749-102331 using Thread-1
INFO - Download Completed 100.0% for file 607800main_kepler1200_1600-1200.jpg
but what I am expecting is
INFO - Pulling for range 0-25583 using Thread-1
INFO - Pulling for range 25584-51166 using Thread-2
INFO - Pulling for range 51167-76748 using Thread-3
INFO - Pulling for range 76749-102331 using Thread-4
Please do not mark duplicate, without understanding...
please recommend if there is a better approach of doing what I am trying to do..
Cheers:/
If you want the download happen simultaneously and just writing to the file to be atomic then don't put the lock around downloading and writing to the file, but just around the write part.
As every thread uses its own file object to write I don't see the need to lock that access anyway. What you have to make sure is every thread writes to the correct offset, so you need a seek() call on the file before writing the data chunk. Otherwise you'll have to write the chunks in file order, which would make things more complicated.

Multiprocessing, writing to file, and deadlock on large loops

I have a very weird problem with the code below. when numrows = 10 the Process loops completes itself and proceeds to finish. If the growing list becomes larger it goes into a deadlock. Why is this and how can I solve this?
import multiprocessing, time, sys
# ----------------- Calculation Engine -------------------
def feed(queue, parlist):
for par in parlist:
queue.put(par)
def calc(queueIn, queueOut):
while True:
try:
par = queueIn.get(block = False)
print "Project ID: %s started. " % par
res = doCalculation(par)
queueOut.put(res)
except:
break
def write(queue, fname):
print 'Started to write to file'
fhandle = open(fname, "w")
while True:
try:
res = queue.get(block = False)
for m in res:
print >>fhandle, m
except:
break
fhandle.close()
print 'Complete writing to the file'
def doCalculation(project_ID):
numrows = 100
toFileRowList = []
for i in range(numrows):
toFileRowList.append([project_ID]*100)
print "%s %s" % (multiprocessing.current_process().name, i)
return toFileRowList
def main():
parlist = [276, 266]
nthreads = multiprocessing.cpu_count()
workerQueue = multiprocessing.Queue()
writerQueue = multiprocessing.Queue()
feedProc = multiprocessing.Process(target = feed , args = (workerQueue, parlist))
calcProc = [multiprocessing.Process(target = calc , args = (workerQueue, writerQueue)) for i in range(nthreads)]
writProc = multiprocessing.Process(target = write, args = (writerQueue, 'somefile.csv'))
feedProc.start()
feedProc.join ()
for p in calcProc:
p.start()
for p in calcProc:
p.join()
writProc.start()
writProc.join()
if __name__=='__main__':
sys.exit(main())
I think the problem is the Queue buffer getting filled, so you need to read from the queue before you can put additional stuff in it.
For example, in your feed thread you have:
queue.put(par)
If you keep putting much stuff without reading this will cause it to block untill the buffer is freed, but the problem is that you only free the buffer in your calc thread, which in turn doesn't get started before you join your blocking feed thread.
So, in order for your feed thread to finish, the buffer should be freed, but the buffer won't be freed before the thread finishes :)
Try organizing your queues access more.
The feedProc and the writeProc are not actually running in parallel with the rest of your program. When you have
proc.start()
proc.join ()
you start the process and then, on the join() you immediatly wait for it to finish. In this case there's no gain in multiprocessing, only overhead. Try to start ALL processes at once before you join them. This will also have the effect that your queues get emptied regularyl and you won't deadlock.

Python: multiple files download by turn

In script loop performs files downloading and saving (curl). But loop iterations too quick, so downloading and saving actions have no time to complete it's operations. Thereat result files comes broken
def get_images_thread(table):
class LoopThread ( threading.Thread ):
def run ( self ):
global db
c=db.cursor()
c.execute(""" SELECT * FROM js_stones ORDER BY stone_id LIMIT 1
""")
ec = EasyCurl(table)
while(1):
stone = c.fetchone()
if stone == None:
break
img_fname = stone[2]
print img_fname
url = "http://www.jstone.it/"+img_fname
fname = url.strip("/").split("/")[-1].strip()
ec.perform(url, filename="D:\\Var\\Python\\Jstone\\downloadeble_pictures\\"+fname,
progress=ec.textprogress)
This is an excerpt from the examples for the PycURL library,
# Make a queue with (url, filename) tuples
queue = Queue.Queue()
for url in urls:
url = url.strip()
if not url or url[0] == "#":
continue
filename = "doc_%03d.dat" % (len(queue.queue) + 1)
queue.put((url, filename))
# Check args
assert queue.queue, "no URLs given"
num_urls = len(queue.queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
class WorkerThread(threading.Thread):
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
while 1:
try:
url, filename = self.queue.get_nowait()
except Queue.Empty:
raise SystemExit
fp = open(filename, "wb")
curl = pycurl.Curl()
curl.setopt(pycurl.URL, url)
curl.setopt(pycurl.FOLLOWLOCATION, 1)
curl.setopt(pycurl.MAXREDIRS, 5)
curl.setopt(pycurl.CONNECTTIMEOUT, 30)
curl.setopt(pycurl.TIMEOUT, 300)
curl.setopt(pycurl.NOSIGNAL, 1)
curl.setopt(pycurl.WRITEDATA, fp)
try:
curl.perform()
except:
import traceback
traceback.print_exc(file=sys.stderr)
sys.stderr.flush()
curl.close()
fp.close()
sys.stdout.write(".")
sys.stdout.flush()
# Start a bunch of threads
threads = []
for dummy in range(num_conn):
t = WorkerThread(queue)
t.start()
threads.append(t)
# Wait for all threads to finish
for thread in threads:
thread.join()
If you're asking what I think you're asking,
from time import sleep
sleep(1)
should "solve"(It's hacky to the max!) your problem. Docs here. I would check that that really is your problem, though. It seems catastrophically unlikely that pausing for a few seconds would stop files from downloading brokenly. Some more detail would be nice too.
os.waitpid()
might also help.

Categories