Python multiprocessing: Reading a large file and updating an imported dictionary - python

I need to read a large file and update an imported dictionary accordingly, using multiprocessing Pool and Manager. Here is my code:
from multiprocessing import Pool, Manager
import json

manager = Manager()
d = manager.dict()
imported_dic = json.load(open('~/file.json'))  # loading a file containing a large dictionary
d.update(imported_dic)

def f(line):
    data = line.split('\t')
    uid = data[0]
    tweet = data[2].decode('utf-8')
    if ...:  # sth in tweet
        d[uid] += 1

p = Pool(4)
with open('~/test_1k.txt') as source_file:
    p.map(f, source_file)
But it does not work properly. Any idea what I am doing wrong here?

Try this code:
d = init_dictionary()  # some of your magic here

def f(line):
    results = []
    data = line.split('\t')
    uid = data[0]
    tweet = data[2].decode('utf-8')
    if uid in d:
        for n in d[uid].keys():
            if n in tweet:
                results.append((uid, n, 1))
            else:
                results.append((uid, n, 0))
    return results

p = Pool(4)
with open('~/test_1k.txt') as source_file:
    for stats in p.map(f, source_file):
        for uid, n, r in stats:
            d[uid][n] += r
It's the same solution, but without a shared dictionary: the workers only return values, and the parent applies the updates.
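For completeness, here is a minimal sketch of the same fan-out/fan-in pattern with no Manager dict at all, assuming the same tab-separated layout (uid in column 0, tweet in column 2) and a placeholder KEYWORD standing in for the elided condition; workers only return values and the parent does every dictionary update:
from collections import Counter
from multiprocessing import Pool

KEYWORD = "something"  # hypothetical search term, stands in for the "#sth" above

def count_line(line):
    # Return the uid when the keyword occurs in the tweet, else None.
    data = line.split('\t')
    uid, tweet = data[0], data[2]
    return uid if KEYWORD in tweet else None

if __name__ == '__main__':
    counts = Counter()
    with Pool(4) as pool, open('test_1k.txt') as source_file:
        for uid in pool.imap_unordered(count_line, source_file, chunksize=256):
            if uid is not None:
                counts[uid] += 1
    # counts can now be merged into the JSON-loaded dictionary in the parent only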

Related

Efficiently Create 1% Sample Using Multiprocessing in Python

I'm trying to process a large dataset (300GB) line by line using multiprocessing. I want to define a 1% random sample based on one variable. My first step is to define the sample, and then I want to read the data file using multiprocessing. I'm guessing that the script will run faster if the part where I define the set used for the random sample isn't run for each child. However, when I try to move that part of the script under the line if __name__ == "__main__":, the child processes no longer seem to recognize the random sample from the parent. I get the error:
NameError: name 'id_pct1' is not defined
Where is the most efficient place to put the portion of the script where I define the random sample?
import os
import random
import multiprocessing as mp

#define sample
uid = list(line.strip() for line in open('Subsets/unique_ids_final.txt'))
pct1 = round(len(uid)/100)
random.seed(1)
id_pct1 = set(random.sample(uid, k=pct1))
id_pct1.add(vname)

#read original file and write 1% sample using multiprocessing
def worker(chunkStart, chunkSize, q):
    with open('myfile.txt') as f:
        tlines = []
        f.seek(chunkStart)
        lines = f.read(chunkSize).splitlines()
        for line in lines:
            data = line.split('*')
            if data[30] in id_pct1:
                tlines.append(line)
        q.put(tlines)
        return tlines

def chunkify(fname, size=1024*1024):
    fileEnd = os.path.getsize(fname)
    with open(fname, 'r') as f:
        chunkEnd2 = 0
        while True:
            chunkStart = chunkEnd2
            f.seek(chunkStart)
            f.read(size)
            chunkEnd1 = f.tell()
            f.readline()
            chunkEnd2 = f.tell()
            chunkSz = 1024*1024 + chunkEnd2 - chunkEnd1 - 1
            yield chunkStart, chunkSz
            if chunkEnd2 >= fileEnd:
                break

def listener(q):
    with open('myfile1pct.txt', 'w') as out_f1:
        while True:
            m = q.get()
            if m == 'kill':
                break
            else:
                for line in m:
                    out_f1.write(line + '\n')
                out_f1.flush()

def main():
    manager = mp.Manager()
    q = manager.Queue()
    pool = mp.Pool()
    watcher = pool.apply_async(listener, (q,))
    jobs = []
    for chunkStart, chunkSize in chunkify('myfile.txt'):
        jobs.append(pool.apply_async(worker, (chunkStart, chunkSize, q)))
    for job in jobs:
        job.get()
    q.put('kill')
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()
If you want the items under #define sample to be available throughout the entire program, you could declare them global. However, relying on global variables is generally considered bad practice. Consider instead incorporating the #define logic into your worker function, like so:
import os
import random
import multiprocessing as mp

#read original file and write 1% sample using multiprocessing
def worker(chunkStart, chunkSize, q):
    #define sample
    uid = list(line.strip() for line in open('Subsets/unique_ids_final.txt'))
    pct1 = round(len(uid)/100)
    random.seed(1)
    id_pct1 = set(random.sample(uid, k=pct1))
    id_pct1.add(vname)
    with open('myfile.txt') as f:
        tlines = []
        f.seek(chunkStart)
        lines = f.read(chunkSize).splitlines()
        for line in lines:
            data = line.split('*')
            if data[30] in id_pct1:
                tlines.append(line)
        q.put(tlines)
        return tlines

def chunkify(fname, size=1024*1024):
    fileEnd = os.path.getsize(fname)
    with open(fname, 'r') as f:
        chunkEnd2 = 0
        while True:
            chunkStart = chunkEnd2
            f.seek(chunkStart)
            f.read(size)
            chunkEnd1 = f.tell()
            f.readline()
            chunkEnd2 = f.tell()
            chunkSz = 1024*1024 + chunkEnd2 - chunkEnd1 - 1
            yield chunkStart, chunkSz
            if chunkEnd2 >= fileEnd:
                break

def listener(q):
    with open('myfile1pct.txt', 'w') as out_f1:
        while True:
            m = q.get()
            if m == 'kill':
                break
            else:
                for line in m:
                    out_f1.write(line + '\n')
                out_f1.flush()

def main():
    manager = mp.Manager()
    q = manager.Queue()
    pool = mp.Pool()
    watcher = pool.apply_async(listener, (q,))
    jobs = []
    for chunkStart, chunkSize in chunkify('myfile.txt'):
        jobs.append(pool.apply_async(worker, (chunkStart, chunkSize, q)))
    for job in jobs:
        job.get()
    q.put('kill')
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()
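If recomputing the sample inside worker() feels wasteful (it re-reads the id file once per chunk), another option is a Pool initializer: the sample is built once in main() and handed to each child process exactly once. A rough sketch under the same file-layout assumptions, reusing chunkify() from the code above (the vname line is left out here because vname comes from elsewhere in the original script):
import random
import multiprocessing as mp

id_pct1 = None  # set once in each child by init_worker

def init_worker(sample_ids):
    # Runs once per child process, so the sample is not rebuilt per chunk.
    global id_pct1
    id_pct1 = sample_ids

def worker(chunk):
    chunkStart, chunkSize = chunk
    tlines = []
    with open('myfile.txt') as f:
        f.seek(chunkStart)
        for line in f.read(chunkSize).splitlines():
            data = line.split('*')
            if data[30] in id_pct1:
                tlines.append(line)
    return tlines

def main():
    uid = [line.strip() for line in open('Subsets/unique_ids_final.txt')]
    random.seed(1)
    sample = set(random.sample(uid, k=round(len(uid) / 100)))
    with mp.Pool(initializer=init_worker, initargs=(sample,)) as pool, \
            open('myfile1pct.txt', 'w') as out_f1:
        for tlines in pool.imap_unordered(worker, chunkify('myfile.txt')):
            for line in tlines:
                out_f1.write(line + '\n')

if __name__ == '__main__':
    main()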

Writing a JSON file using multiprocessing - Python

I have a JSON file with values that must be extracted and processed, and the results added back to the file as new values. To do this I use multiprocessing, and although the work is supposedly synchronized with a lock, I still get race conditions.
The function that is called just rescales a rating from the range 0-5 to the range 0-100.
Thanks in advance!
import json
import multiprocessing

n = 1000
maxRating = 5
percentage = 100
inputfile = 'rawData.JSON'
outfile = 'processedData.JSON'

# load data into dictionary "data"
with open(inputfile) as f:
    data = json.load(f)

# create an empty dictionary that will contain the new information
results = {}

def saver(init, end, q, l):
    for num in range(init, end):
        l.acquire()
        rating = data["bars"][num]["rating"]
        ratioRating = (percentage * rating) / maxRating
        results["ratingP"] = ratioRating
        print(ratioRating)
        # put data in queue
        q.put(results)
        l.release()

# main function
if __name__ == '__main__':
    i = 0
    cores = 4
    q = multiprocessing.Queue()
    lock = multiprocessing.Lock()
    if cores > 1:  # parallel
        for i in range(cores):
            init = (i * n) // cores
            fin = ((i + 1) * n) // cores
            p = multiprocessing.Process(target=saver, args=(init, fin, q, lock)).start()
        for i in range(n):
            data["bars"][i].update(q.get())  # update "data" dictionary, adding new processed data
    else:  # sequential
        saver(0, n, q, lock)
        for l in range(n):
            data["bars"][l].update(q.get())  # update "data" dictionary, adding new processed data
    # write the updated JSON file with the added processed data
    with open(outfile, 'w') as outfile:
        json.dump(data, outfile)
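As a point of comparison, here is a minimal race-free sketch of the same rescaling, assuming the same "bars"/"rating" layout: each worker receives one bar and returns a new one, so no lock, queue, or shared results dict is needed, and only the parent touches the file.
import json
from multiprocessing import Pool

maxRating = 5
percentage = 100

def add_percentage(bar):
    # Pure function: one bar dict in, the same bar out with the new key added.
    bar["ratingP"] = (percentage * bar["rating"]) / maxRating
    return bar

if __name__ == '__main__':
    with open('rawData.JSON') as f:
        data = json.load(f)
    with Pool(4) as pool:
        data["bars"] = pool.map(add_percentage, data["bars"])
    with open('processedData.JSON', 'w') as out:
        json.dump(data, out)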

Python Code Speed Up

My code compares two vectors saved as dictionaries (two pickle files) and saves the result into a pickle file too. This works, but very slowly: one comparison takes about 7:20 minutes. Because I have a lot of videos (exactly 2033), the program would run for about 10 days. This is too long. How can I speed up my code for Python 2.7?
import math
import csv
import pickle
from itertools import izip

global_ddc_file = 'E:/global_ddc.p'
io = 'E:/AV-Datensatz'
v_source = ''

def dot_product(v1, v2):
    return sum(map(lambda x: x[0] * x[1], izip(v1, v2)))  # izip('ABCD', 'xy') --> Ax By

def cosine_measure(v1, v2):
    prod = dot_product(v1, v2)
    len1 = math.sqrt(dot_product(v1, v1))
    len2 = math.sqrt(dot_product(v2, v2))
    if (len1 * len2) <> 0:
        out = prod / (len1 * len2)
    else:
        out = 0
    return out

def findSource(v):
    v_id = "/" + v[0].lstrip("<http://av.tib.eu/resource/video").rstrip(">")
    v_source = io + v_id
    v_file = v_source + '/vector.p'
    source = [v_id, v_source, v_file]
    return source

def getVector(v, vectorCol):
    with open(v, 'rb') as f:
        try:
            vector_v = pickle.load(f)
        except:
            print 'file couldnt be loaded'
        tf_idf = []
        tf_idf = [vec[1][vectorCol] for vec in vector_v]
    return tf_idf

def compareVectors(v1, v2, vectorCol):
    v1_source = findSource(v1)
    v2_source = findSource(v2)
    V1 = getVector(v1_source[2], vectorCol)
    V2 = getVector(v2_source[2], vectorCol)
    sim = [v1_source[0], v2_source[0], cosine_measure(V1, V2)]
    return sim

# with open('videos_av_portal_cc_3.0_nur2bspStanford.csv', 'rb') as dataIn:
# with open('videos_av_portal_cc_3.0.csv', 'rb') as dataIn:
with open('videos_av_portal_cc_3.0_vollstaendig.csv', 'rb') as dataIn:
    try:
        reader = csv.reader(dataIn)
        v_source = []
        for row in reader:
            v_source.append(findSource(row))
        # print v_source
        for one in v_source:
            print one[1]
            compVec = []
            for another in v_source:
                if one <> another:
                    compVec.append(compareVectors(one, another, 3))
            compVec_sort = sorted(compVec, key=lambda cosim: cosim[2], reverse=True)
            # save vector file for each video
            with open(one[1] + '/compare.p', 'wb') as f:
                pickle.dump(compVec_sort, f)
    finally:
        dataIn.close()
Split the code into parts:
1. Load the dictionaries of vectors.
2. Compare two dictionaries using multiprocessing (see the sketch after this list).
3. Launch processes simultaneously according to memory availability, end each process after 8 minutes, then update the third dictionary.
4. Relaunch processes on the next set of data, follow step 3, and continue until the dictionary is exhausted.
This should reduce the total turnaround time.
Let me know if you need code.
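A rough sketch of step 2, assuming compareVectors() and findSource() from the question live at module level so child processes can call them: the inner loop over "another" becomes a Pool.map over the pairs for one video. compare_pair and compare_all are sketch names, not part of the original script.
import pickle
from multiprocessing import Pool

def compare_pair(pair):
    one, another = pair
    return compareVectors(one, another, 3)  # reuses compareVectors() from the question

def compare_all(one, v_source, nprocs=4):
    # Score every other video against "one" in parallel, then sort and save.
    pairs = [(one, another) for another in v_source if one != another]
    pool = Pool(nprocs)
    try:
        compVec = pool.map(compare_pair, pairs)
    finally:
        pool.close()
        pool.join()
    compVec.sort(key=lambda cosim: cosim[2], reverse=True)
    with open(one[1] + '/compare.p', 'wb') as f:
        pickle.dump(compVec, f)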

Better examples of Parallel processing in Python

I hope I am not downvoted this time. I have been struggling with parallel processing in Python for a while (two days, exactly). I have been checking these resources (a partial list is shown here):
(a) http://eli.thegreenplace.net/2013/01/16/python-paralellizing-cpu-bound-tasks-with-concurrent-futures
(b) https://pythonadventures.wordpress.com/tag/processpoolexecutor/
I came unstuck. What I want to do is this:
Master:
Break up the file into chunks(strings or numbers)
Broadcast a pattern to be searched to all the workers
Receive the offsets in the file where the pattern was found
Workers:
Receive pattern and chunk of text from the master
Compute()
Send back the offsets to the master.
I tried to implement this using MPI/concurrent.futures/multiprocessing and came unstuck.
My naive implementation using the multiprocessing module:
import multiprocessing

filename = "file1.txt"
pat = "afow"
N = 1000

""" This is the naive string search algorithm"""
def search(pat, txt):
    patLen = len(pat)
    txtLen = len(txt)
    offsets = []
    # A loop to slide pattern[] one by one
    # Range generates numbers up to but not including that number
    for i in range((txtLen - patLen) + 1):
        # Can not use a for loop here
        # For loops in C with && statements must be
        # converted to while statements in python
        counter = 0
        while (counter < patLen) and pat[counter] == txt[counter + i]:
            counter += 1
        if counter >= patLen:
            offsets.append(i)
    return str(offsets).strip('[]')

"""
This is what I want:

if __name__ == "__main__":
    tasks = []
    pool_outputs = []
    pool = multiprocessing.Pool(processes=5)
    with open(filename, 'r') as infile:
        lines = []
        for line in infile:
            lines.append(line.rstrip())
            if len(lines) > N:
                pool_output = pool.map(search, tasks)
                pool_outputs.append(pool_output)
                lines = []
        if len(lines) > 0:
            pool_output = pool.map(search, tasks)
            pool_outputs.append(pool_output)
    pool.close()
    pool.join()
    print('Pool:', pool_outputs)
"""

with open(filename, 'r') as infile:
    for line in infile:
        print(search(pat, line))
I would be grateful for any guidance, especially with concurrent.futures. Thanks for your time. Valeriy helped me with his addition and I thank him for that.
But if anyone could just indulge me for a moment, this is the code I was working on for concurrent.futures (working off an example I saw somewhere):
from concurrent.futures import ProcessPoolExecutor, as_completed
import math

def search(pat, txt):
    patLen = len(pat)
    txtLen = len(txt)
    offsets = []
    # A loop to slide pattern[] one by one
    # Range generates numbers up to but not including that number
    for i in range((txtLen - patLen) + 1):
        # Can not use a for loop here
        # For loops in C with && statements must be
        # converted to while statements in python
        counter = 0
        while (counter < patLen) and pat[counter] == txt[counter + i]:
            counter += 1
        if counter >= patLen:
            offsets.append(i)
    return str(offsets).strip('[]')

# Check a list of strings
def chunked_worker(lines):
    return {0: search("fmo", line) for line in lines}

def pool_bruteforce(filename, nprocs):
    lines = []
    with open(filename) as f:
        lines = [line.rstrip('\n') for line in f]
    chunksize = int(math.ceil(len(lines) / float(nprocs)))
    futures = []
    with ProcessPoolExecutor() as executor:
        for i in range(nprocs):
            chunk = lines[(chunksize * i): (chunksize * (i + 1))]
            futures.append(executor.submit(chunked_worker, chunk))
    resultdict = {}
    for f in as_completed(futures):
        resultdict.update(f.result())
    return resultdict

filename = "file1.txt"
pool_bruteforce(filename, 5)
Thanks again, Valeriy, and anyone who attempts to help me solve my riddle.
Your search function takes several arguments, so fix the pattern with functools.partial:
import multiprocessing
from functools import partial

filename = "file1.txt"
pat = "afow"
N = 1000

""" This is the naive string search algorithm"""
def search(pat, txt):
    patLen = len(pat)
    txtLen = len(txt)
    offsets = []
    # A loop to slide pattern[] one by one
    # Range generates numbers up to but not including that number
    for i in range((txtLen - patLen) + 1):
        # Can not use a for loop here
        # For loops in C with && statements must be
        # converted to while statements in python
        counter = 0
        while (counter < patLen) and pat[counter] == txt[counter + i]:
            counter += 1
        if counter >= patLen:
            offsets.append(i)
    return str(offsets).strip('[]')

if __name__ == "__main__":
    tasks = []
    pool_outputs = []
    pool = multiprocessing.Pool(processes=5)
    lines = []
    with open(filename, 'r') as infile:
        for line in infile:
            lines.append(line.rstrip())
    tasks = lines
    func = partial(search, pat)
    if len(lines) > N:
        pool_output = pool.map(func, lines)
        pool_outputs.append(pool_output)
    elif len(lines) > 0:
        pool_output = pool.map(func, lines)
        pool_outputs.append(pool_output)
    pool.close()
    pool.join()
    print('Pool:', pool_outputs)
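Since the question also asked about concurrent.futures, the same partial trick carries over; a minimal sketch reusing the search() defined above:
from concurrent.futures import ProcessPoolExecutor
from functools import partial

if __name__ == "__main__":
    pat = "afow"
    with open("file1.txt") as infile:
        lines = [line.rstrip() for line in infile]
    func = partial(search, pat)  # same trick: fix the pattern argument
    with ProcessPoolExecutor(max_workers=5) as executor:
        # executor.map keeps results in input order; chunksize batches the lines
        offsets = list(executor.map(func, lines, chunksize=256))
    print('Pool:', offsets)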

Python I/O multiprocessing with no Return on function

I have a working python script that, in a simplified way, works as follows:
open("A", 'r')
open("B", 'r')
open("C", 'w')
for lineA in A:
part1, part2, part3 = lineA.split(' ')
for lineB in B:
if part2 in lineB:
C.write(lineB)
I want to check in file B if a section of the line of file A exists there. If so, write that whole line from file B in a new file C.
The process is somewhat time consuming the way I have designed it (1-I still consider myself a noob with Python, 2-There are at least 4 IF statements running inside the main FOR loop), and now I have started to use input files around 200x larger than previously, so I am getting times of around 5 hours per input file here.
I have tried to use multiprocessing but I can't seem to get it to work.
I tried a simple code inside my main() function initially, without any significant improvement and definitely without using more than one CPU:
p = Process(target=multi_thread, args=(arg1,arg2,arg3))
p.start()
p.join()
Then I tried the jobs approach:
jobs = []
for i in range(4):
    p = Process(target='myfunc')
    jobs.append(p)
    p.start()
    p.join()
And a pool example I found here in the forums, to which I added a Return statement to my main function:
def multiproc(arg1, arg2, arg3):
    (...)
    return lineB  # example of Return statement

def main():
    pool = Pool(4)
    with open('file.txt', 'w') as map_file:
        # chunk the work into batches of 4 lines at a time
        results = pool.map(multi_thread, map_file, 4)

if __name__ == "__main__":
    main()
The jobs approach actually created the file and then restarted the whole process from scratch 3 more times. The last one gives me the following error:
io.UnsupportedOperation: not readable
And I also suppose that my Return statement is breaking my loop...
Any suggestions on enabling multiprocessing for this piece of code, or on improving its neatness?
Thanks!
EDIT:
As requested, here is the full messy code:
#!/usr/bin/python3
__author__ = 'daniel'

import os
import re
from multiprocessing import Process
from multiprocessing import Pool
import time

start_time = time.time()

def multi_thread(filePath, datasetFolder, mapFileDataset):
    fout = open('outdude.txt', 'w')
    cwd = os.getcwd()
    cwdgen, sep, id = filePath.rpartition('/')
    dataset = datasetFolder.rsplit("/", 1)
    dataset = dataset[1]
    ## Create file
    for i in os.listdir(cwd):
        if ".ped" in i:
            sample_id, sep, rest = i.partition('.ped')
            for i in os.listdir(cwd):
                if sample_id + '.pileupgatk' in i and dataset in i:
                    pileup4map = open(i, 'r')
                    snpcounter = sum(1 for _ in pileup4map) - 1
                    pileup4map.seek(0)
                    mapout = open(sample_id + '.map', 'w')
                    counter = 1
                    for line in pileup4map:
                        if counter <= snpcounter:
                            mapFileData = open(datasetFolder + '/' + mapFileDataset, 'r')
                            line = line.rstrip()
                            chro, coord, refb, rbase, qual = line.split(' ')
                            chrom = chro.strip("chr")
                            counter += 1
                            for ligna in mapFileData:
                                if coord in ligna:
                                    k = re.compile(r'(?=%s )' % coord, re.I)
                                    lookAhead = k.search(ligna)
                                    k = re.compile(r'(?<= %s)' % coord, re.I)
                                    lookBehind = k.search(ligna)
                                    if lookAhead and lookBehind != None:
                                        lignaChrom = ligna[:2].rstrip(' ')
                                        if chrom == lignaChrom:
                                            lignaOut = ligna.rstrip()
                                            mapout.write(lignaOut + '\n')
                                            ## For POOL
                                            return lignaOut
                                        else:
                                            pass
                                    else:
                                        pass
                                else:
                                    pass
                    mapout.close()

def main():
    # Multiproc
    # p = Process(target=multi_thread, args=('/home/full_karyo.fa', '/home/haak15', 'dataPP.map'))
    # p.start()
    # p.join()
    # print("--- %s seconds ---" % (time.time() - start_time))

    # Jobs
    # jobs = []
    # for i in range(4):
    #     p = Process(target=multi_thread, args=('/home/full_karyo.fa', '/home/haak15', 'dataPP.map'))
    #     jobs.append(p)
    #     p.start()
    #     p.join()

    # Pool
    pool = Pool(4)
    with open('file.txt', 'w') as map_file:
        # chunk the work into batches of 4 lines at a time
        results = pool.map(multi_thread, map_file, 4)
        print(results)
    print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == "__main__":
    main()
EDIT2:
Following Robert E's and TheBigC's advice I re-wrote my code and it is now 13x faster, and not as confusing. I used a dictionary approach that is not as I/O hungry as the previous one, as TheBigC pointed out. I am happy enough with the speed, so I will leave multiprocessing aside for now. Thanks for the comments!
if makemap == True:
    ## Dictionary method - 13X faster
    for i in os.listdir(cwd):
        if ".ped" in i:
            sample_id, sep, rest = i.partition('.ped')
            for i in os.listdir(cwd):
                if sample_id + '.pileupgatk' in i and dataset in i:
                    print("\n\t> Creating MAP file from sample: " + sample_id)
                    pileup4map = open(i, 'r')
                    snpcounter = sum(1 for _ in pileup4map) - 1
                    pileup4map.seek(0)
                    counter = 1
                    piledic = {}
                    for line in pileup4map:
                        if counter <= snpcounter:
                            line = line.rstrip()
                            # chr21 43805965 G G G
                            chro, coord, refb, rbase, qual = line.split(' ')
                            chrom = chro.strip("chr")
                            piledic[chrom, coord] = int(counter)
                            counter += 1
                    pileup4map.close()
                    mapFileData = open(datasetFolder + '/' + mapFileDataset, 'r')
                    mapDic = {}
                    counterM = 1
                    for ligna in mapFileData:
                        # 22 Affx-19821577 0.737773 50950707 A G
                        chroMap, ident, prob, posMap, bas1, bas2 = ligna.split()
                        mapDic[chroMap, posMap] = int(counterM)
                        counterM += 1
                    listOfmatches = []
                    for item in piledic:
                        if item in mapDic:
                            listOfmatches.append(mapDic[item])
                    listOfmatches.sort()
                    mapWrite = open(sample_id + ".map", 'w')
                    mapFileData.seek(0)
                    lineCounter = 1
                    for lignagain in mapFileData:
                        if lineCounter in listOfmatches:
                            mapWrite.write(lignagain)
                        lineCounter += 1
                    mapWrite.close()
                    mapFileData.close()
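The core of the 13x win is replacing the nested file scans with dictionary lookups; a toy illustration of that idea with made-up coordinates (not data from the real files):
# Key each file once by (chromosome, position), then look keys up instead of
# rescanning the map file for every pileup line.
pile = {('21', '43805965'): 1, ('22', '50950707'): 2}
mapping = {('22', '50950707'): 10, ('1', '12345'): 11}

matches = sorted(mapping[key] for key in pile if key in mapping)
print(matches)  # [10] -> these line numbers pick which map lines get copied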
