A very confusing problem about Python multiprocessing - python

I have a problem at work filtering a file. For example, if I run this code the first time, it is slow, and it runs until global_step = 100000; then I terminate it. If I run it again, it is fast until global_step = 100000, and after that it becomes slow again. I can't figure out why this happens. Can someone give me some advice? Sorry for my poor English.
from multiprocessing import Queue, Process, Value

read_dir = '..'
write_dir = '..'
dict_dir = '..'
Q_read = Queue(10000)
Q_write = Queue(10000)
global_step = Value('i', 0)

def Push_queue(Q_read, r_dir):
    f = open(r_dir, 'r')
    lines = f.readlines()
    for line in lines:
        Q_read.put(line)
    f.close()

def Write_from_Queue(Q_write, w_dir):
    fw = open(w_dir, 'w')
    while True:
        try:
            line = Q_write.get(timeout=30)
            fw.write(line)
            fw.flush()
        except:
            fw.close()
            return

def asy_run(Q_read, Q_write, global_step, char2ind_dict):
    while True:
        line = Q_read.get(timeout=30)
        #########################
        line = .......do something
        #########################
        Q_write.put(line)
        global_step.value += 1

def main_run(num, char2ind_dict):
    process_list = []
    process_push = Process(target=Push_queue, args=(Q_read, read_dir))
    process_push.start()
    for i in range(num):
        process_i = Process(target=asy_run, args=(Q_read, Q_write, global_step, char2ind_dict))
        process_i.start()
        process_list.append(process_i)
    process_write = Process(target=Write_from_Queue, args=(Q_write, write_dir))
    process_write.start()
    process_push.join()
    Q_read.join()
    Q_write.join()
    for p in process_list:
        p.join()
    process_write.join()

if __name__ == '__main__':
    char2ind_dict = get_dict(dict_dir)
    main_run(50, char2ind_dict)
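Editorial aside, not an explanation of the reported slowdown: multiprocessing.Queue has no join() method (only JoinableQueue does), and ending the pipeline through get(timeout=30) ties shutdown to timeouts. Below is a minimal sentinel-based sketch of the same producer/worker/writer layout, meant only to illustrate the shutdown pattern; names such as SENTINEL and num_workers are illustrative.

from multiprocessing import Queue, Process, Value

SENTINEL = None  # illustrative end-of-stream marker

def push_queue(q_read, r_dir, num_workers):
    with open(r_dir, 'r') as f:
        for line in f:
            q_read.put(line)
    for _ in range(num_workers):      # one sentinel per worker
        q_read.put(SENTINEL)

def asy_run(q_read, q_write, global_step):
    while True:
        line = q_read.get()
        if line is SENTINEL:
            q_write.put(SENTINEL)     # pass the shutdown signal downstream
            break
        # ... do something with line ...
        q_write.put(line)
        with global_step.get_lock():  # += on a shared Value is not atomic
            global_step.value += 1

def write_from_queue(q_write, w_dir, num_workers):
    finished = 0
    with open(w_dir, 'w') as fw:
        while finished < num_workers: # stop after every worker has signalled
            line = q_write.get()
            if line is SENTINEL:
                finished += 1
                continue
            fw.write(line)

# Wiring mirrors main_run above: start one pusher, num workers, and one writer,
# then join the pusher, the workers, and finally the writer; no queue joins or
# timeouts are needed.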

Related

Efficiently Create 1% Sample Using Multiprocessing in Python

I'm trying to process a large dataset (300GB) line by line using multiprocessing. I want to define a 1% random sample based on one variable. My first step is to define the sample, and then I want to read the data file using multiprocessing. I'm guessing that the script will run faster if the part where I define the set used for the random sample isn't run for each child. However, when I try to move that part of the script under the line if __name__ == "__main__":, the child processes no longer seem to recognize the random sample from the parent. I get the error:
NameError: name 'id_pct1' is not defined
Where is the most efficient place to put the portion of the script where I define the random sample?
#define sample
uid = list(line.strip() for line in open('Subsets/unique_ids_final.txt'))
pct1 = round(len(uid)/100)
random.seed(1)
id_pct1 = set(random.sample(uid, k=pct1))
id_pct1.add(vname)

#read original file and write 1% sample using multiprocessing
def worker(chunkStart, chunkSize, q):
    with open('myfile.txt') as f:
        tlines = []
        f.seek(chunkStart)
        lines = f.read(chunkSize).splitlines()
        for line in lines:
            data = line.split('*')
            if data[30] in id_pct1: tlines.append(line)
        q.put(tlines)
        return tlines

def chunkify(fname, size=1024*1024):
    fileEnd = os.path.getsize(fname)
    with open(fname, 'r') as f:
        chunkEnd2 = 0
        while True:
            chunkStart = chunkEnd2
            f.seek(chunkStart)
            f.read(size)
            chunkEnd1 = f.tell()
            f.readline()
            chunkEnd2 = f.tell()
            chunkSz = 1024*1024 + chunkEnd2 - chunkEnd1 - 1
            yield chunkStart, chunkSz
            if chunkEnd2 >= fileEnd:
                break

def listener(q):
    with open('myfile1pct.txt', 'w') as out_f1:
        while True:
            m = q.get()
            if m == 'kill': break
            else:
                for line in m:
                    out_f1.write(line+'\n')
                out_f1.flush()

def main():
    manager = mp.Manager()
    q = manager.Queue()
    pool = mp.Pool()
    watcher = pool.apply_async(listener, (q,))
    jobs = []
    for chunkStart, chunkSize in chunkify('myfile.txt'):
        jobs.append(pool.apply_async(worker, (chunkStart, chunkSize, q)))
    for job in jobs:
        job.get()
    q.put('kill')
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()
If you want those items in #define to be available throughout the entire program, you could use the global keyword on them. However, declaring global variables is generally considered bad practice, so consider just incorporating your #define logic into your functions, like so:
#read original file and write 1% sample using multiprocessing
def worker(chunkStart, chunkSize, q):
    #define sample
    uid = list(line.strip() for line in open('Subsets/unique_ids_final.txt'))
    pct1 = round(len(uid)/100)
    random.seed(1)
    id_pct1 = set(random.sample(uid, k=pct1))
    id_pct1.add(vname)
    with open('myfile.txt') as f:
        tlines = []
        f.seek(chunkStart)
        lines = f.read(chunkSize).splitlines()
        for line in lines:
            data = line.split('*')
            if data[30] in id_pct1: tlines.append(line)
        q.put(tlines)
        return tlines

def chunkify(fname, size=1024*1024):
    fileEnd = os.path.getsize(fname)
    with open(fname, 'r') as f:
        chunkEnd2 = 0
        while True:
            chunkStart = chunkEnd2
            f.seek(chunkStart)
            f.read(size)
            chunkEnd1 = f.tell()
            f.readline()
            chunkEnd2 = f.tell()
            chunkSz = 1024*1024 + chunkEnd2 - chunkEnd1 - 1
            yield chunkStart, chunkSz
            if chunkEnd2 >= fileEnd:
                break

def listener(q):
    with open('myfile1pct.txt', 'w') as out_f1:
        while True:
            m = q.get()
            if m == 'kill': break
            else:
                for line in m:
                    out_f1.write(line+'\n')
                out_f1.flush()

def main():
    manager = mp.Manager()
    q = manager.Queue()
    pool = mp.Pool()
    watcher = pool.apply_async(listener, (q,))
    jobs = []
    for chunkStart, chunkSize in chunkify('myfile.txt'):
        jobs.append(pool.apply_async(worker, (chunkStart, chunkSize, q)))
    for job in jobs:
        job.get()
    q.put('kill')
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()
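An alternative that avoids both globals and re-reading the id file in every task (a hedged sketch, not part of the original answer; chunkify and listener are assumed to be the functions from the question, and file names follow the question): build the sample once in main() and pass the set to each worker call as an extra argument.

import random
import multiprocessing as mp

def worker(chunkStart, chunkSize, q, id_pct1):
    # id_pct1 arrives as an argument, so it exists in the child process
    tlines = []
    with open('myfile.txt') as f:
        f.seek(chunkStart)
        for line in f.read(chunkSize).splitlines():
            data = line.split('*')
            if data[30] in id_pct1:
                tlines.append(line)
    q.put(tlines)

def main():
    # define the sample once, in the parent process
    uid = [line.strip() for line in open('Subsets/unique_ids_final.txt')]
    random.seed(1)
    id_pct1 = set(random.sample(uid, k=round(len(uid) / 100)))

    manager = mp.Manager()
    q = manager.Queue()
    pool = mp.Pool()
    watcher = pool.apply_async(listener, (q,))            # listener as in the question
    jobs = [pool.apply_async(worker, (start, size, q, id_pct1))
            for start, size in chunkify('myfile.txt')]    # chunkify as in the question
    for job in jobs:
        job.get()
    q.put('kill')
    watcher.get()
    pool.close()
    pool.join()

Note that the set is pickled and sent to the workers with each task, which is fine for a modest id list; for a very large set, a Pool initializer that loads it once per worker is the usual alternative.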

Python I/O multiprocessing with no Return on function

I have a working python script that, in a simplified way, works as follows:
open("A", 'r')
open("B", 'r')
open("C", 'w')
for lineA in A:
part1, part2, part3 = lineA.split(' ')
for lineB in B:
if part2 in lineB:
C.write(lineB)
I want to check whether a section of each line of file A exists in file B. If so, I write that whole line from file B to a new file C.
The process is somewhat time consuming the way I have designed it (1 - I still consider myself a noob with Python, 2 - there are at least 4 if statements running inside the main for loop), and now I have started to use input files around 200x larger than before, so I am getting times of around 5 hours per input file.
I have tried to use multiprocessing but I can't seem to get it to work.
I initially tried a simple piece of code inside my main() function, without any significant improvement and definitely without using more than one CPU:
p = Process(target=multi_thread, args=(arg1,arg2,arg3))
p.start()
p.join()
Then I tried the jobs approach:
jobs = []
for i in range(4):
    p = Process(target='myfunc')
    jobs.append(p)
    p.start()
    p.join()
And a pool example I found here in the forums, to which I added a return statement to my function:
def multiproc(arg1, arg2, arg3):
    (...)
    return lineB  # example of return statement

def main():
    pool = Pool(4)
    with open('file.txt', 'w') as map_file:
        # chunk the work into batches of 4 lines at a time
        results = pool.map(multi_thread, map_file, 4)

if __name__ == "__main__":
    main()
The jobs approach actually created the file and then restarted the whole process from scratch 3 more times. The last (pool) approach gives me the following error:
io.UnsupportedOperation: not readable
And I also suppose that my return statement is breaking my loop...
Any suggestions on how to enable multiprocessing for this piece of code, or on how to improve its neatness?
Thanks!
EDIT:
As requested, here is the full messy code:
#!/usr/bin/python3
__author__ = 'daniel'

import os
import re
from multiprocessing import Process
from multiprocessing import Pool
import time

start_time = time.time()

def multi_thread(filePath, datasetFolder, mapFileDataset):
    fout = open('outdude.txt', 'w')
    cwd = os.getcwd()
    cwdgen, sep, id = filePath.rpartition('/')
    dataset = datasetFolder.rsplit("/", 1)
    dataset = dataset[1]
    ## Create file
    for i in os.listdir(cwd):
        if ".ped" in i:
            sample_id, sep, rest = i.partition('.ped')
            for i in os.listdir(cwd):
                if sample_id+'.pileupgatk' in i and dataset in i:
                    pileup4map = open(i, 'r')
                    snpcounter = sum(1 for _ in pileup4map)-1
                    pileup4map.seek(0)
                    mapout = open(sample_id+'.map', 'w')
                    counter = 1
                    for line in pileup4map:
                        if counter <= snpcounter:
                            mapFileData = open(datasetFolder+'/'+mapFileDataset, 'r')
                            line = line.rstrip()
                            chro, coord, refb, rbase, qual = line.split(' ')
                            chrom = chro.strip("chr")
                            counter += 1
                            for ligna in mapFileData:
                                if coord in ligna:
                                    k = re.compile(r'(?=%s )' % coord, re.I)
                                    lookAhead = k.search(ligna)
                                    k = re.compile(r'(?<= %s)' % coord, re.I)
                                    lookBehind = k.search(ligna)
                                    if lookAhead and lookBehind != None:
                                        lignaChrom = ligna[:2].rstrip(' ')
                                        if chrom == lignaChrom:
                                            lignaOut = ligna.rstrip()
                                            mapout.write(lignaOut+'\n')
                                            ## For POOL
                                            return lignaOut
                                        else:
                                            pass
                                    else:
                                        pass
                                else:
                                    pass
                    mapout.close()

def main():
    #Multiproc
    # p = Process(target=multi_thread, args=('/home/full_karyo.fa', '/home/haak15', 'dataPP.map'))
    # p.start()
    # p.join()
    # print("--- %s seconds ---" % (time.time() - start_time))

    #Jobs
    # jobs = []
    # for i in range(4):
    #     p = Process(target=multi_thread, args=('/home/full_karyo.fa', '/home/haak15', 'dataPP.map'))
    #     jobs.append(p)
    #     p.start()
    #     p.join()

    #Pool
    pool = Pool(4)
    with open('file.txt', 'w') as map_file:
        # chunk the work into batches of 4 lines at a time
        results = pool.map(multi_thread, map_file, 4)
        print(results)
    print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == "__main__":
    main()
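An editorial note on the io.UnsupportedOperation: not readable error above (an inference from the traceback, not from the original thread): Pool.map iterates over its second argument, and map_file here is opened with 'w', so reading from it fails. The usual wiring hands Pool.map (or imap) an iterable of inputs and lets the parent write the results. A minimal hedged sketch, with a hypothetical process_line standing in for the real per-line work and illustrative file names:

from multiprocessing import Pool

def process_line(line):
    # placeholder for the real filtering logic; 'MARKER' is illustrative
    return line if 'MARKER' in line else None

def main():
    with open('input.txt') as src, open('file.txt', 'w') as out:
        with Pool(4) as pool:
            # the iterable handed to imap is the *input* file, read lazily;
            # results come back in order and are written by the parent
            for result in pool.imap(process_line, src, chunksize=4):
                if result is not None:
                    out.write(result)

if __name__ == "__main__":
    main()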
EDIT2:
Following Robert E and TheBigC's advice I re-wrote my code, and it is now 13x faster and not as confusing. As TheBigC pointed out, I used a dictionary approach that is not as I/O hungry as the previous one. I am happy enough with the speed, so I will leave multiprocessing aside for now. Thanks for the comments!
if makemap == True:
    ## Dictionary method - 13X faster
    for i in os.listdir(cwd):
        if ".ped" in i:
            sample_id, sep, rest = i.partition('.ped')
            for i in os.listdir(cwd):
                if sample_id+'.pileupgatk' in i and dataset in i:
                    print("\n\t> Creating MAP file from sample: "+sample_id)
                    pileup4map = open(i, 'r')
                    snpcounter = sum(1 for _ in pileup4map)-1
                    pileup4map.seek(0)
                    counter = 1
                    piledic = {}
                    for line in pileup4map:
                        if counter <= snpcounter:
                            line = line.rstrip()
                            #chr21 43805965 G G G
                            chro, coord, refb, rbase, qual = line.split(' ')
                            chrom = chro.strip("chr")
                            piledic[chrom, coord] = int(counter)
                            counter += 1
                    pileup4map.close()
                    mapFileData = open(datasetFolder+'/'+mapFileDataset, 'r')
                    mapDic = {}
                    counterM = 1
                    for ligna in mapFileData:
                        #22 Affx-19821577 0.737773 50950707 A G
                        chroMap, ident, prob, posMap, bas1, bas2 = ligna.split()
                        mapDic[chroMap, posMap] = int(counterM)
                        counterM += 1
                    listOfmatches = []
                    for item in piledic:
                        if item in mapDic:
                            listOfmatches.append(mapDic[item])
                    listOfmatches.sort()
                    mapWrite = open(sample_id+".map", 'w')
                    mapFileData.seek(0)
                    lineCounter = 1
                    for lignagain in mapFileData:
                        if lineCounter in listOfmatches:
                            mapWrite.write(lignagain)
                        lineCounter += 1
                    mapWrite.close()
                    mapFileData.close()
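If multiprocessing is ever added back, one hedged option (illustrative names and paths, not from the post) is to build the lookup dictionary once per worker process with a Pool initializer, so it is not rebuilt for every chunk of pileup lines:

import multiprocessing as mp

_lookup = None  # populated once per worker process

def init_worker(map_path):
    # build the (chrom, pos) -> line-number dict once per worker
    global _lookup
    _lookup = {}
    with open(map_path) as f:
        for n, ligna in enumerate(f, start=1):
            chro_map, ident, prob, pos_map, b1, b2 = ligna.split()
            _lookup[(chro_map, pos_map)] = n

def match_chunk(pile_lines):
    # return the map-file line numbers matched by a chunk of pileup lines
    hits = []
    for line in pile_lines:
        chro, coord, refb, rbase, qual = line.rstrip().split(' ')
        if (chro.strip("chr"), coord) in _lookup:
            hits.append(_lookup[(chro.strip("chr"), coord)])
    return hits

# usage sketch (path is illustrative):
# with mp.Pool(processes=4, initializer=init_worker, initargs=('dataPP.map',)) as pool:
#     results = pool.map(match_chunk, chunks_of_pileup_lines)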

How to return a list by using multi thread in Python

Background:
I want to convert a list of strings into float values.
However, it's too time consuming due to the large size of the list,
so I decided to use multiprocessing or multithreading.
With multiprocessing I ran into issue 3770 and failed.
With multithreading I cannot return the list.
Can anyone guide me, please?
Code:
import random
import threading

def convert(strList):
    fList = []
    fList = map(float, strList)
    return fList

line = ""
for i in xrange(0, 500000):
    line += str(random.uniform(-1, 1))+" "
line += "\n"

fobj_w = open("data", "w")
fobj_w.write(line)
fobj_w.close()

fobj_r = open("data", "r")
line = fobj_r.readline()
line = line.strip(" \n")
line_ls = line.split(" ")

threads = []
t1 = threading.Thread(target=convert, args=(line_ls[:100000],))
t2 = threading.Thread(target=convert, args=(line_ls[100000:200000],))
t3 = threading.Thread(target=convert, args=(line_ls[200000:300000],))
t4 = threading.Thread(target=convert, args=(line_ls[300000:400000],))
t5 = threading.Thread(target=convert, args=(line_ls[400000:],))
threads.append(t1)
threads.append(t2)
threads.append(t3)
threads.append(t4)
threads.append(t5)

for t in threads:
    t.setDaemon(True)
    t.start()
    t.join()
#how to get the return list here??
Can't you just use numpy's fromstring() function for this?
import random
import numpy as np

line = ""
for i in xrange(0, 500000):
    line += str(random.uniform(-1, 1))+" "
num_line = np.fromstring(line, dtype=float, sep=" ")
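A hedged side note, not part of the answer above: converting strings to floats in pure Python is CPU-bound, so threads gain little because of the GIL, and numpy or a process pool is usually the faster route. But to answer the literal question of getting a list back from worker threads, Python 3's concurrent.futures exposes the return values directly; a minimal sketch (function and variable names are illustrative):

from concurrent.futures import ThreadPoolExecutor

def convert(str_list):
    return [float(s) for s in str_list]

def convert_in_chunks(line_ls, n_chunks=5):
    size = len(line_ls) // n_chunks + 1
    chunks = [line_ls[i:i + size] for i in range(0, len(line_ls), size)]
    with ThreadPoolExecutor(max_workers=n_chunks) as ex:
        # executor.map yields each convert() result, in submission order
        results = list(ex.map(convert, chunks))
    # flatten the per-chunk lists back into one list of floats
    return [x for chunk in results for x in chunk]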

Output between single-threaded and multi-threaded versions of same application differs [Python]

I have written 2 versions of a program to parse a log file and return the number of strings that match a given regex. The single-threaded version returns the correct output:
Number of Orders ('ORDER'): 1108
Number of Replacements ('REPLACE'): 742
Number of Orders and Replacements: 1850
Time to process: 5.018553
The multithreaded program however returns erroneous values:
Number of Orders ('ORDER'): 1579
Number of Replacements ('REPLACE'): 1108
Number of Orders and Replacements: 2687
Time to process: 2.783091
The time can vary (it should be faster for the multithreaded one), but I can't seem to find why the values for orders and replacements differ between the two versions.
Here is the multithreaded version:
import re
import time
import sys
import threading
import Queue

class PythonLogParser:
    queue = Queue.Queue()

    class FileParseThread(threading.Thread):
        def __init__(self, parsefcn, f, startind, endind, olist):
            threading.Thread.__init__(self)
            self.parsefcn = parsefcn
            self.startind = startind
            self.endind = endind
            self.olist = olist
            self.f = f

        def run(self):
            self.parsefcn(self.f, self.startind, self.endind, self.olist)

    def __init__(self, filename):
        assert(len(filename) != 0)
        self.filename = filename
        self.start = 0
        self.end = 0

    def open_file(self):
        f = None
        try:
            f = open(self.filename)
        except IOError as e:
            print 'Unable to open file:', e.message
        return f

    def count_orders_from(self, f, starting, ending, offset_list):
        f.seek(offset_list[starting])
        order_pattern = re.compile(r'.*(IN:)(\s)*(ORDER).*(ord_type)*')
        replace_pattern = re.compile(r'.*(IN:)(\s)*(REPLACE).*(ord_type)*')
        order_count = replace_count = 0
        for line in f:
            if order_pattern.match(line) != None:
                order_count += 1  # = order_count + 1
            if replace_pattern.match(line) != None:
                replace_count += 1  # = replace_count + 1
        #return (order_count, replace_count, order_count+replace_count)
        self.queue.put((order_count, replace_count, order_count+replace_count))

    def get_file_data(self):
        offset_list = []
        offset = 0
        num_lines = 0
        f = 0
        try:
            f = open(self.filename)
            for line in f:
                num_lines += 1
                offset_list.append(offset)
                offset += len(line)
            f.close()
        finally:
            f.close()
        return (num_lines, offset_list)

    def count_orders(self):
        self.start = time.clock()
        num_lines, offset_list = self.get_file_data()
        start_t1 = 0
        end_t1 = num_lines/2
        start_t2 = end_t1 + 1
        f = open(self.filename)
        t1 = self.FileParseThread(self.count_orders_from, f, start_t1, end_t1, offset_list)
        self.count_orders_from(f, start_t2, num_lines, offset_list)
        t1.start()
        self.end = time.clock()
        tup1 = self.queue.get()
        tup2 = self.queue.get()
        order_count1, replace_count1, sum1 = tup1
        order_count2, replace_count2, sum2 = tup2
        print 'Number of Orders (\'ORDER\'): {0}\n'\
              'Number of Replacements (\'REPLACE\'): {1}\n'\
              'Number of Orders and Replacements: {2}\n'\
              'Time to process: {3}\n'.format(order_count1+order_count2, \
                                              replace_count1+replace_count2, \
                                              sum1+sum2, \
                                              self.end - self.start)
        f.close()

def test2():
    p = PythonLogParser('../../20150708.aggregate.log')
    p.count_orders()

def main():
    test2()

main()
The idea is that since the file is large, each thread will read half of it: t1 reads the first half and the main thread reads the second. The main thread then adds together the results from both and displays them.
My suspicion was that order_count and replace_count in count_orders_from were somehow being modified between threads rather than starting at 0 for each call, but I wasn't sure, since I don't see why separate calls to a method from 2 separate threads would modify the same variables.
The error was occurring because, even though in theory the threads were parsing individual halves, what was in fact happening is that one thread parsed up to the halfway point while the other parsed the full file, so items were double counted. This was fixed by adding a linecount variable to count_orders_from, in order to check whether the reader has reached the line it is supposed to stop at.
def count_orders_from(self, f, starting, ending, offset_list):
    f.seek(offset_list[starting])
    order_pattern = re.compile(r'.*(IN:)(\s)*(ORDER).*(ord_type)*')
    replace_pattern = re.compile(r'.*(IN:)(\s)*(REPLACE).*(ord_type)*')
    order_count = replace_count = linecount = 0
    for line in f:
        if order_pattern.match(line) != None:
            order_count += 1  # = order_count + 1
        if replace_pattern.match(line) != None:
            replace_count += 1  # = replace_count + 1
        if linecount == ending:
            break
        linecount += 1
    self.queue.put((order_count, replace_count, order_count+replace_count))
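A hedged variant, not from the post: both the main thread and t1 read through the same file object f, which only works here because the two parses run back to back in count_orders() (the main thread's call finishes before t1.start() is reached). If the two parses ever run truly concurrently, each call needs its own handle so the shared file position cannot interleave; a minimal sketch under that assumption:

def count_orders_from(self, starting, ending, offset_list):
    order_pattern = re.compile(r'.*(IN:)(\s)*(ORDER).*(ord_type)*')
    replace_pattern = re.compile(r'.*(IN:)(\s)*(REPLACE).*(ord_type)*')
    order_count = replace_count = 0
    linecount = starting
    # a private handle per call: no shared seek position between threads
    with open(self.filename) as f:
        f.seek(offset_list[starting])
        for line in f:
            if order_pattern.match(line) is not None:
                order_count += 1
            if replace_pattern.match(line) is not None:
                replace_count += 1
            if linecount == ending:
                break
            linecount += 1
    self.queue.put((order_count, replace_count, order_count + replace_count))

The FileParseThread wrapper and count_orders() would then pass only the indices and offset list, not a shared file handle.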

Python syntax error, command, *args= line.split()

I am getting a Python syntax error for command, *args = line.split()
Here is my bce.py file
import sys
import os
import pcn
import urp

class BCE(object):
    def __init__(self, inPath, outPath):
        self.inPath = inPath
        self.outPath = outPath
        self.eqs = {}
        self.operations = {
            "r": self.read_pcn,
            "!": self.do_not,
            "+": self.do_or,
            "&": self.do_and,
            "p": self.write_pcn,
            "q": self.quit
        }
        self.done = False

    def process(self, commandFilePath):
        with open(commandFilePath, "r") as f:
            for line in f:
                command, *args = line.split()
                self.operations[command](*args)
                if self.done:
                    return

    def read_pcn(self, fNum):
        _, self.eqs[fNum] = pcn.parse(os.path.join(self.inPath, fNum + ".pcn"))

    def write_pcn(self, fNum):
        with open(os.path.join(self.outPath, fNum + ".pcn"), "w") as f:
            pcn.write(f, None, self.eqs[fNum])

    def do_not(self, resultNum, inNum):
        self.eqs[resultNum] = urp.complement(self.eqs[inNum])

    def do_or(self, resultNum, leftNum, rightNum):
        self.eqs[resultNum] = urp.cubes_or(self.eqs[leftNum], self.eqs[rightNum])

    def do_and(self, resultNum, leftNum, rightNum):
        self.eqs[resultNum] = urp.cubes_and(self.eqs[leftNum], self.eqs[rightNum])

    def quit(self):
        self.done = True

Usage = """\
USAGE: {} COMMAND_FILE
"""

if __name__ == "__main__":
    if len(sys.argv) > 1:
        solutionDir = "BCESolutions"
        thisSolDir = os.path.join(solutionDir, sys.argv[1][-5])
        try:
            os.mkdir(thisSolDir)
        except OSError:
            # It's okay if it's already there
            pass
        bce = BCE("BooleanCalculatorEngine", thisSolDir)
        bce.process(sys.argv[1])
    else:
        print(Usage.format(sys.argv[0]))
And here is my pcn.py file
from itertools import islice
from itertools import chain

def parse(filePath):
    with open(filePath, "rb") as f:
        # First line is size of array
        try:
            lines = iter(f)
            numVars = int(next(lines))
            cubeCount = int(next(lines))
            cubes = [None]*cubeCount
            for i in range(cubeCount):
                line = next(lines)
                cubes[i] = tuple(islice(map(int, line.split()), 1, None))
            return (numVars, tuple(cubes))
        except Exception as error:
            raise AssertionError("Bad pcn file {}".format(filePath)) from error

def write(f, numVars, cubes):
    endl = "\n"
    f.write(str(max(max(map(abs, cube)) for cube in cubes)))
    f.write(endl)
    f.write(str(len(cubes)))
    f.write(endl)
    cubes = tuple(set(tuple(sorted(cube, key=abs)) for cube in cubes))
    for cube in cubes:
        f.write(' '.join(map(str, chain((len(cube),), cube))))
        f.write(endl)
    f.write(endl)
Tuple assignment with a *star_target entry only works in Python 3; you cannot use it in Python 2. See PEP 3132 - Extended Iterable Unpacking.
As a workaround, use just one target and then slice the result:
split_result = line.split()
command, args = split_result[0], split_result[1:]
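For illustration (the line contents here are made up), the two forms behave the same:

line = "& 3 1 2"

# Python 3 only (PEP 3132 extended iterable unpacking)
command, *args = line.split()          # command == '&', args == ['3', '1', '2']

# Works on Python 2 and 3
split_result = line.split()
command, args = split_result[0], split_result[1:]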
