I'm having the following problem in Python.
I need to do some calculations in parallel, and their results need to be written sequentially to a file. So I created a function that receives a multiprocessing.Queue and a file handle, does the calculation, and prints the result to the file:
import multiprocessing
from multiprocessing import Process, Queue
from mySimulation import doCalculation

# doCalculation(pars) is a function I must run for many different sets of parameters
# and collect the results in a file

def work(queue, fh):
    while True:
        try:
            parameter = queue.get(block=False)
            result = doCalculation(parameter)
            print >>fh, result
        except:
            break

if __name__ == "__main__":
    nthreads = multiprocessing.cpu_count()
    fh = open("foo", "w")
    workQueue = Queue()
    parList = # list of conditions for which I want to run doCalculation()
    for x in parList:
        workQueue.put(x)
    processes = [Process(target=work, args=(workQueue, fh)) for i in range(nthreads)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    fh.close()
But the file ends up empty after the script runs. I tried to change the work() function to:
def work(queue, filename):
    while True:
        try:
            fh = open(filename, "a")
            parameter = queue.get(block=False)
            result = doCalculation(parameter)
            print >>fh, result
            fh.close()
        except:
            break
and pass the filename as a parameter. Then it works as I intended. When I do the same thing sequentially, without multiprocessing, it also works normally.
Why didn't it work in the first version? I can't see the problem.
Also: can I guarantee that two processes won't try to write to the file simultaneously?
EDIT:
Thanks. I got it now. This is the working version:
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep
from random import uniform

def doCalculation(par):
    t = uniform(0, 2)
    sleep(t)
    return par * par  # just to simulate some calculation

def feed(queue, parlist):
    for par in parlist:
        queue.put(par)

def calc(queueIn, queueOut):
    while True:
        try:
            par = queueIn.get(block=False)
            print "dealing with ", par, ""
            res = doCalculation(par)
            queueOut.put((par, res))
        except:
            break

def write(queue, fname):
    fhandle = open(fname, "w")
    while True:
        try:
            par, res = queue.get(block=False)
            print >>fhandle, par, res
        except:
            break
    fhandle.close()

if __name__ == "__main__":
    nthreads = multiprocessing.cpu_count()
    fname = "foo"
    workerQueue = Queue()
    writerQueue = Queue()
    parlist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    feedProc = Process(target=feed, args=(workerQueue, parlist))
    calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
    writProc = Process(target=write, args=(writerQueue, fname))

    feedProc.start()
    for p in calcProc:
        p.start()
    writProc.start()

    feedProc.join()
    for p in calcProc:
        p.join()
    writProc.join()
You really should use two queues and three separate kinds of processing.
Put stuff into Queue #1.
Get stuff out of Queue #1 and do calculations, putting stuff in Queue #2. You can have many of these, since they get from one queue and put into another queue safely.
Get stuff out of Queue #2 and write it to a file. You must have exactly 1 of these and no more. It "owns" the file, guarantees atomic access, and absolutely assures that the file is written cleanly and consistently.
If anyone is looking for a simple way to do the same, this can help you.
I don't think there are any disadvantages to doing it in this way. If there are, please let me know.
import multiprocessing
import re

def mp_worker(item):
    # Do something with the item to produce a count.
    # (Placeholder word count below so the example runs; replace with your own logic.)
    count = len(re.findall(r"\w+", item))
    return item, count

def mp_handler():
    cpus = multiprocessing.cpu_count()
    p = multiprocessing.Pool(cpus)
    # The two lines below populate the list. listX will later be processed in parallel.
    # This can be replaced as long as listX is passed on to the next step.
    with open('ExampleFile.txt') as f:
        listX = [line for line in (l.strip() for l in f) if line]
    with open('results.txt', 'w') as f:
        for result in p.imap(mp_worker, listX):
            # (item, count) tuples from worker
            f.write('%s: %d\n' % result)

if __name__ == '__main__':
    mp_handler()
Source: Python: Writing to a single file with queue while using multiprocessing Pool
There is a mistake in the write worker code: if block is False, the worker will never get any data. It should be as follows:
par, res = queue.get(block = True)
You can check it by adding the line
print "QSize", queueOut.qsize()
after
queueOut.put((par, res))
With block=False you would see an ever-increasing queue length until the queue fills up, unlike with block=True, where you always get "1". Alternatively, the writer can block and exit on a sentinel value, as sketched below.
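One way to make a blocking get() terminate cleanly is to have the main process put a sentinel on the writer's queue once the calc processes have been joined. This is only a sketch of that idea; the None sentinel and the write() formatting are my assumptions, not part of the original code:

SENTINEL = None  # assumed end-of-work marker; the calc workers never produce None

def write(queue, fname):
    fhandle = open(fname, "w")
    while True:
        item = queue.get(block=True)   # wait for data instead of polling
        if item is SENTINEL:           # main process signals that no more results will come
            break
        par, res = item
        fhandle.write("%s %s\n" % (par, res))
    fhandle.close()

# in __main__, after the calc processes have been joined:
#     writerQueue.put(SENTINEL)
#     writProc.join()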
Related
I have a function generate(file_path) which returns an integer index and a numpy array. A simplified version of the generate function is as follows:
def generate(file_path):
    temp = np.load(file_path)
    # get index from the string file_path
    idx = int(file_path.split("_")[0])
    # do some mathematical operation on temp
    result = operate(temp)
    return idx, result
I need to glob through a directory and collect the results of generate(file_path) into an HDF5 file. My serial code is as follows:
for path in glob.glob(directory):
    idx, result = generate(path)
    hdf5_file["results"][idx, :] = result
hdf5_file.close()
I hope to write multi-threaded or multi-process code to speed up the above. How could I modify it? Thanks!
My attempt is to modify my generate function and my "main" as follows:
def generate(file_path):
    temp = np.load(file_path)
    # get index from the string file_path
    idx = int(file_path.split("_")[0])
    # do some mathematical operation on temp
    result = operate(temp)
    hdf5_path = "./result.hdf5"
    hdf5_file = h5py.File(hdf5_path, 'w')
    hdf5_file["results"][idx, :] = result
    hdf5_file.close()

if __name__ == '__main__':
    # construct hdf5 file
    hdf5_path = "./output.hdf5"
    hdf5_file = h5py.File(hdf5_path, 'w')
    hdf5_file.create_dataset("results", [2000, 15000], np.uint8)
    hdf5_file.close()
    path_ = "./compute/*"
    p = Pool(mp.cpu_count())
    p.map(generate, glob.glob(path_))
    hdf5_file.close()
    print("finished")
However, it does not work. It throws this error:
KeyError: "Unable to open object (object 'results' doesn't exist)"
You can use a thread or process pool to execute multiple function calls concurrently. Here is an example which uses a process pool:
from concurrent.futures import ProcessPoolExecutor
from time import sleep

def generate(file_path: str) -> int:
    sleep(1.0)
    return int(file_path.split("_")[1])

def main():
    file_paths = ["path_1", "path_2", "path_3"]
    with ProcessPoolExecutor() as pool:
        results = pool.map(generate, file_paths)
        for result in results:
            # Write to the HDF5 file
            print(result)

if __name__ == "__main__":
    main()
Note that you should not write to the same HDF5 file concurrently, i.e. the file writing should not happen in the generate function; a sketch of that single-writer pattern is shown below.
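A rough sketch of that pattern applied to the question's code might look like this: the pool only returns (idx, result) pairs and the parent process owns the HDF5 file. The operate() placeholder and the index parsing are copied from the question and are assumptions here, not a tested implementation:

import glob
from concurrent.futures import ProcessPoolExecutor

import h5py
import numpy as np

def operate(arr):
    return arr  # placeholder for the question's own computation

def generate(file_path):
    temp = np.load(file_path)
    idx = int(file_path.split("_")[0])   # index parsing copied from the question
    return idx, operate(temp)

def main():
    file_paths = glob.glob("./compute/*")
    with h5py.File("./output.hdf5", "w") as hdf5_file:
        hdf5_file.create_dataset("results", (2000, 15000), dtype=np.uint8)
        with ProcessPoolExecutor() as pool:
            for idx, result in pool.map(generate, file_paths):
                # single writer: only the parent process touches the file
                hdf5_file["results"][idx, :] = result

if __name__ == "__main__":
    main()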
After examining your code, I noticed some errors in how the dataset is initialised.
You create the HDF5 file with the path "./result.hdf5" inside the generate function.
However, I think you neglected to create a "results" dataset in that file, which is what causes the "object doesn't exist" error (see the sketch below).
Kindly reply if you still face the same issue, with the error message.
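For illustration, creating the dataset right after opening the file is what avoids that KeyError. Shape and dtype are copied from the question's main block; this only shows the dataset-creation step, not a fix for the concurrent writes:

import h5py
import numpy as np

with h5py.File("./result.hdf5", "w") as f:
    f.create_dataset("results", (2000, 15000), dtype=np.uint8)
    f["results"][0, :] = np.zeros(15000, dtype=np.uint8)   # indexing the dataset now works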
I have a program that uses Python threading to read different lines in a file; if it reads a duplicate line, it reads another one, and once it has read a line, it removes that line from the file. The problem is that when it reads the file it doesn't seem to update it, or I'm not quite sure what's happening: it can sometimes read the same line as before, which breaks it. I'm also not sure my code is the most effective way to do this.
def read_tokens_list():
    tokens = []
    with open('inputTokens.txt', 'r', encoding='UTF-8') as file:
        lines = file.readlines()
        for line in lines:
            tokens.append(line.replace('\n', ''))
    return tokens

def worker(token_list):
    while True:
        token = random.choice(token_list)
        print(token)
        ver = open("Fullyverified.txt", "a+")
        ver.write(token + "\n")
        with open("inputTokens.txt", "r") as f:
            lines = f.readlines()
        with open("inputTokens.txt", "w") as f:
            for line in lines:
                if line.strip("\n") != token:
                    f.write(line)
        time.sleep(1)

def main():
    threads = []
    num_thread = input('Number of Threads: ')
    num_thread = int(num_thread)
    token_list = read_tokens_list()  # read in the inputTokens.txt file
    random.shuffle(token_list)  # shuffle the list into random order
    tokens_per_worker = len(token_list) // num_thread  # how many tokens from the list each worker will get (roughly)
    for i in range(num_thread):
        if (i + 1) < num_thread:
            num_tokens_for_this_worker = tokens_per_worker  # give each worker an even share of the list
        else:
            num_tokens_for_this_worker = len(token_list)  # except the last worker gets whatever is left
        # we'll give the first (num_tokens_for_this_worker) tokens in the list to this worker
        tokens_for_this_worker = token_list[0:num_tokens_for_this_worker]
        # and remove those tokens from the list so that they won't get used by anyone else
        token_list = token_list[num_tokens_for_this_worker:]
        t = threading.Thread(target=worker, args=(tokens_for_this_worker, ))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()
Use a lock.
Something like:
from threading import Lock

# ...
lock = Lock()
# ...

def worker(token_list, lock=lock):
    # ...
    with lock:
        with open("inputTokens.txt", "r") as f:
            lines = f.readlines()
        with open("inputTokens.txt", "w") as f:
            for line in lines:
                if line.strip("\n") != token:
                    f.write(line)
    # ...
The idea of the lock is to protect a resource from being accessed by several threads simultaneously. So while one thread is working with the file, the others are waiting.
The next question is whether this approach still makes sense, because depending on the size of your file, threads might be stuck waiting for the lock most of the time.
What about a database instead of a file? Then you don't have to rewrite the full file, but just delete/update an entry.
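If you go the database route, a minimal sqlite3 sketch of the idea might look like this. The table layout, the file name tokens.db, and the take_token helper are all assumptions for illustration:

import sqlite3
import threading

db_lock = threading.Lock()   # serialize access from the worker threads
conn = sqlite3.connect("tokens.db", check_same_thread=False)
conn.execute("CREATE TABLE IF NOT EXISTS tokens (token TEXT PRIMARY KEY)")
conn.commit()

def take_token():
    """Fetch one token and delete it in a single locked transaction (hypothetical helper)."""
    with db_lock, conn:   # 'with conn' commits the transaction on success
        row = conn.execute("SELECT token FROM tokens LIMIT 1").fetchone()
        if row is None:
            return None   # nothing left to process
        conn.execute("DELETE FROM tokens WHERE token = ?", (row[0],))
        return row[0]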
I have to parse 30 days of access logs from the server based on client IP and accessed hosts, and I need to know the top 10 accessed sites. The log files are around 10-20 GB in size, which takes a lot of time for single-threaded execution of the script. Initially, I wrote a script that was working fine, but it takes a lot of time due to the large log file size. Then I tried to use the multiprocessing library for parallel processing, but it is not working: it seems the multiprocessing implementation is repeating tasks instead of processing them in parallel. I'm not sure what is wrong in the code. Can someone please help with this? Thank you so much in advance.
Code:
from datetime import datetime, timedelta
import commands
import os
import string
import sys
import multiprocessing

def ipauth(slave_list, static_ip_list):
    file_record = open('/home/access/top10_domain_accessed/logs/combined_log.txt', 'a')
    count = 1
    while count <= 30:
        Nth_days = datetime.now() - timedelta(days=count)
        date = Nth_days.strftime("%Y%m%d")
        yr_month = Nth_days.strftime("%Y/%m")
        file_name = 'local2' + '.' + date
        with open(slave_list) as file:
            for line in file:
                string = line.split()
                slave = string[0]
                proxy = string[1]
                log_path = "/LOGS/%s/%s" % (slave, yr_month)
                try:
                    os.path.exists(log_path)
                    file_read = os.path.join(log_path, file_name)
                    with open(file_read) as log:
                        for log_line in log:
                            log_line = log_line.strip()
                            if proxy in log_line:
                                file_record.write(log_line + '\n')
                except IOError:
                    pass
        count = count + 1
    file_log = open('/home/access/top10_domain_accessed/logs/ipauth_logs.txt', 'a')
    with open(static_ip_list) as ip:
        for line in ip:
            with open('/home/access/top10_domain_accessed/logs/combined_log.txt', 'r') as f:
                for content in f:
                    log_split = content.split()
                    client_ip = log_split[7]
                    if client_ip in line:
                        content = str(content).strip()
                        file_log.write(content + '\n')
    return

if __name__ == '__main__':
    slave_list = sys.argv[1]
    static_ip_list = sys.argv[2]
    jobs = []
    for i in range(5):
        p = multiprocessing.Process(target=ipauth, args=(slave_list, static_ip_list))
        jobs.append(p)
        p.start()
    p.join()
UPDATE AFTER CONVERSATION WITH OP, PLEASE SEE COMMENTS
My take: Split the file into smaller chunks and use a process pool to work on those chunks:
import multiprocessing

def chunk_of_lines(fp, n):
    # read n lines from file
    # then yield
    pass

def process(lines, static_ip_list):
    pass  # do stuff to a file

p = multiprocessing.Pool()
fp = open(slave_list)
for f in chunk_of_lines(fp, 10):
    p.apply_async(process, [f, static_ip_list])
p.close()
p.join()  # Wait for all child processes to close.
There are many ways to implement the chunk_of_lines method: you could iterate over the file's lines with a simple for loop, or do something more advanced like calling fp.read(); one possible implementation is sketched below.
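For example, a possible chunk_of_lines implementation using itertools.islice (a sketch; the chunk size and return type are up to you):

from itertools import islice

def chunk_of_lines(fp, n):
    # yield successive lists of up to n lines from an open file
    while True:
        chunk = list(islice(fp, n))
        if not chunk:
            break
        yield chunk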
The following is the code I'm using for parallel CSV processing:
#!/usr/bin/env python
import csv
from time import sleep
from multiprocessing import Pool
from multiprocessing import cpu_count
from multiprocessing import current_process
from pprint import pprint as pp

def init_worker(x):
    sleep(.5)
    print "(%s,%s)" % (x[0], x[1])
    x.append(int(x[0])**2)
    return x

def parallel_csv_processing(inputFile, outputFile, header=["Default", "header", "please", "change"], separator=",", skipRows=0, cpuCount=1):
    # OPEN FH FOR READING INPUT FILE
    inputFH = open(inputFile, "rt")
    csvReader = csv.reader(inputFH, delimiter=separator)

    # SKIP HEADERS
    for skip in xrange(skipRows):
        csvReader.next()

    # PARALLELIZE COMPUTING INTENSIVE OPERATIONS - CALL FUNCTION HERE
    try:
        p = Pool(processes=cpuCount)
        results = p.map(init_worker, csvReader, chunksize=10)
        p.close()
        p.join()
    except KeyboardInterrupt:
        p.close()
        p.join()
        p.terminate()

    # CLOSE FH FOR READING INPUT
    inputFH.close()

    # OPEN FH FOR WRITING OUTPUT FILE
    outputFH = open(outputFile, "wt")
    csvWriter = csv.writer(outputFH, lineterminator='\n')

    # WRITE HEADER TO OUTPUT FILE
    csvWriter.writerow(header)

    # WRITE RESULTS TO OUTPUT FILE
    [csvWriter.writerow(row) for row in results]

    # CLOSE FH FOR WRITING OUTPUT
    outputFH.close()
    print pp(results)
    # print len(results)

def main():
    inputFile = "input.csv"
    outputFile = "output.csv"
    parallel_csv_processing(inputFile, outputFile, cpuCount=cpu_count())

if __name__ == '__main__':
    main()
I would like to somehow measure the progress of the script (just plain text, not any fancy ASCII art). The one option that comes to my mind is to compare the lines successfully processed by init_worker to all lines in input.csv and print the current state, e.g., every second; can you please point me to the right solution? I've found several articles with a similar problem, but I was not able to adapt them to my needs because none used the Pool class and the map method. I would also like to ask about the p.close(), p.join(), and p.terminate() methods: I've seen them mainly with the Process class, not Pool; are they necessary with the Pool class, and have I used them correctly? The use of p.terminate() was intended to kill the process with Ctrl+C, but that is a different story which does not have a happy ending yet. Thank you.
PS: My input.csv looks like this, if it matters:
0,0
1,3
2,6
3,9
...
...
48,144
49,147
PPS: As I said, I'm a newbie at multiprocessing and the code I've put together just works. The one drawback I can see is that the whole CSV is stored in memory, so if you have a better idea, do not hesitate to share it.
Edit
in reply to #J.F.Sebastian
Here is my actual code based on your suggestions:
#!/usr/bin/env python
import csv
from time import sleep
from multiprocessing import Pool
from multiprocessing import cpu_count
from multiprocessing import current_process
from pprint import pprint as pp
from tqdm import tqdm

def do_job(x):
    sleep(.5)
    # print "(%s,%s)" % (x[0],x[1])
    x.append(int(x[0])**2)
    return x

def parallel_csv_processing(inputFile, outputFile, header=["Default", "header", "please", "change"], separator=",", skipRows=0, cpuCount=1):
    # OPEN FH FOR READING INPUT FILE
    inputFH = open(inputFile, "rb")
    csvReader = csv.reader(inputFH, delimiter=separator)

    # SKIP HEADERS
    for skip in xrange(skipRows):
        csvReader.next()

    # OPEN FH FOR WRITING OUTPUT FILE
    outputFH = open(outputFile, "wt")
    csvWriter = csv.writer(outputFH, lineterminator='\n')

    # WRITE HEADER TO OUTPUT FILE
    csvWriter.writerow(header)

    # PARALLELIZE COMPUTING INTENSIVE OPERATIONS - CALL FUNCTION HERE
    try:
        p = Pool(processes=cpuCount)
        # results = p.map(do_job, csvReader, chunksize=10)
        for result in tqdm(p.imap_unordered(do_job, csvReader, chunksize=10)):
            csvWriter.writerow(result)
        p.close()
        p.join()
    except KeyboardInterrupt:
        p.close()
        p.join()

    # CLOSE FH FOR READING INPUT
    inputFH.close()
    # CLOSE FH FOR WRITING OUTPUT
    outputFH.close()
    print pp(result)
    # print len(result)

def main():
    inputFile = "input.csv"
    outputFile = "output.csv"
    parallel_csv_processing(inputFile, outputFile, cpuCount=cpu_count())

if __name__ == '__main__':
    main()
Here is the output of tqdm:
1 [elapsed: 00:05, 0.20 iters/sec]
What does this output mean? On the page you referred to, tqdm is used in a loop in the following way:
>>> import time
>>> from tqdm import tqdm
>>> for i in tqdm(range(100)):
... time.sleep(1)
...
|###-------| 35/100 35% [elapsed: 00:35 left: 01:05, 1.00 iters/sec]
This output makes sense, but what does my output mean? Also, it does not seem that the Ctrl+C problem is fixed: after hitting Ctrl+C the script throws a traceback; if I hit Ctrl+C again I get a new traceback, and so on. The only way to kill it is to send it to the background (Ctrl+Z) and then kill it (kill %1).
To show the progress, replace pool.map with pool.imap_unordered:
from tqdm import tqdm  # $ pip install tqdm

for result in tqdm(pool.imap_unordered(init_worker, csvReader, chunksize=10)):
    csvWriter.writerow(result)
The tqdm part is optional; see Text Progress Bar in the Console.
Incidentally, it also fixes your "whole csv is stored in memory" and "KeyboardInterrupt is not raised" problems.
Here's a complete code example:
#!/usr/bin/env python
import itertools
import logging
import multiprocessing
import time

def compute(i):
    time.sleep(.5)
    return i**2

if __name__ == "__main__":
    logging.basicConfig(format="%(asctime)-15s %(levelname)s %(message)s",
                        datefmt="%F %T", level=logging.DEBUG)
    pool = multiprocessing.Pool()
    try:
        for square in pool.imap_unordered(compute, itertools.count(), chunksize=10):
            logging.debug(square)  # report progress by printing the result
    except KeyboardInterrupt:
        logging.warning("got Ctrl+C")
    finally:
        pool.terminate()
        pool.join()
You should see the output in batches every .5 * chunksize seconds. If you press Ctrl+C, you should see KeyboardInterrupt raised in the child processes and in the main process. In Python 3, the main process exits immediately. In Python 2, the KeyboardInterrupt is delayed until the next batch should have been printed (a bug in Python).
I have a lot of files (300-500) to read, and I want to accelerate this task.
The idea is:
from multiprocessing import Pool
import os
import _io

filelist = map(open, os.listdir())

if __name__ == '__main__':
    with Pool() as pool:
        a = pool.map(_io.TextIOWrapper.read, filelist)
Of course, I got an error:
TypeError: cannot serialize '_io.TextIOWrapper' object
The question is: can I accelerate the I/O process with parallelism? If yes, how?
UPDATE (conclusion):
Now I have found a way to parallelize it and have tested my code:
I used 22 items, totalling 63.2 MB
from multiprocessing import Pool
import os
import _io

def my_read(file_name):
    with open(file_name) as f:
        return f.read()

def mul():
    with Pool() as pool:
        a = pool.map(my_read, os.listdir())

def single():
    a = []
    for i in os.listdir():
        with open(i) as f:
            r = f.read()
            a.append(r)

if __name__ == '__main__':
    mul()
    # single()
Sadly, single() costs 0.4s while mul() costs 0.8s.
UPDATE 1:
Some people said it's an I/O-bound task, so I cannot improve it with parallelism.
However, I found these words in the Python docs:
However, threading is still an appropriate model if you want to run multiple I/O-bound tasks simultaneously.
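For reference, a threaded variant of my_read/mul from above might look like this (a sketch using concurrent.futures; mul_threaded is a hypothetical name and I have not benchmarked it against the files in question):

from concurrent.futures import ThreadPoolExecutor
import os

def my_read(file_name):
    with open(file_name) as f:
        return f.read()

def mul_threaded():
    # threads instead of processes: no pickling, and the GIL is released during file I/O
    with ThreadPoolExecutor() as pool:
        return list(pool.map(my_read, os.listdir()))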
The full code is here:
My purpose is to convert an EPUB to txt.
I have parallelized char2text and now I want to accelerate readall:
import zipfile
from multiprocessing import Pool
import bs4

def char2text(i):
    soup = bs4.BeautifulSoup(i)
    chapter = soup.body.getText().splitlines()
    chapter = "\n".join(chapter).strip() + "\n\n"
    return chapter

class Epub(zipfile.ZipFile):
    def __init__(self, file, mode='r', compression=0, allowZip64=False):
        zipfile.ZipFile.__init__(self, file, mode, compression, allowZip64)
        if mode == 'r':
            self.opf = self.read('OEBPS/content.opf').decode()
            opf_soup = bs4.BeautifulSoup(self.opf)
            self.author = opf_soup.find(name='dc:creator').getText()
            self.title = opf_soup.find(name='dc:title').getText()
            try:
                self.description = opf_soup.find(name='dc:description').getText()
            except:
                self.description = ''
            try:
                self.chrpattern = opf_soup.find(name='dc:chrpattern').getText()
            except:
                self.chrpattern = ''
            self.cover = self.read('OEBPS/images/cover.jpg')
        elif mode == 'w':
            pass

    def get_text(self):
        self.tempread = ""
        charlist = self.readall(self.namelist())
        with Pool() as pool:
            txtlist = pool.map(char2text, charlist)
        self.tempread = "".join(txtlist)
        return self.tempread

    def readall(self, namelist):
        charlist = []
        for i in namelist:
            if i.startswith('OEBPS/') and i.endswith('.xhtml'):
                r = self.read(i).decode()
                charlist.append(r)
        return charlist

    def epub2txt(self):
        tempread = self.get_text()
        with open(self.title + '.txt', 'w', encoding='utf8') as f:
            f.write(tempread)

if __name__ == "__main__":
    e = Epub("assz.epub")
    import cProfile
    cProfile.run("e.epub2txt()")
Did you try something like:
from multiprocessing import Pool
import os
import _io

def my_read(file_name):
    with open(file_name) as f:
        return _io.TextIOWrapper.read(f)

if __name__ == '__main__':
    with Pool() as pool:
        a = pool.map(my_read, os.listdir('some_dir'))
It sounds more logical to me to open/close the file in the sub-process, and strings are easily serializable.
For your readall method, try:
def readall(self, namelist):
    filter_func = lambda i: i.startswith('OEBPS/') and i.endswith('.xhtml')
    read_fun = lambda i: self.read(i).decode()
    with Pool() as pool:
        a = pool.map(read_fun, filter(filter_func, namelist))
    return a