I have 2 big text files (17 MB right now, but they could grow to gigabytes), so I don't want to load them into RAM because their size could exceed my RAM capacity.
The code I wrote for now is this :
def stopIfFileExist(filename):
    """Abort when ``filename`` already names an existing regular file.

    Args:
        filename: Path to check.

    Raises:
        Exception: If ``os.path.isfile(filename)`` is true. The message is
            ``"<filename> already exist"``.
    """
    if os.path.isfile(filename):
        raise Exception("%s already exist" % filename)
def compareDump(before_filename, after_filename, diff_filename):
Compare 2 dumps generated via makeDump(output_filename) and generate
a file containing the differences
-before_filename : (string) filename of the first dump
-after_filename : (string) filename of the second dump
-diff_filename : (string) filename of the diff
num_lines = sum(1 for line in open(after_filename))
one_percent = num_lines/float(100)
diff = []
start = time.time()
with open(after_filename, "r") as afterFile:
counter = 0
for a_line in afterFile:
print "completion : %.9f percents" %(counter/float(one_percent))
counter = counter + 1
with open(before_filename, "r") as beforeFile:
for b_line in beforeFile:
if a_line.rstrip() == b_line.rstrip():
end = time.time()
print "task completed in %s seconds" %(end - start)
with open(diff_filename, "a") as diffFile:
for line in diff:
What I'd like to do is remove from beforeFile each line that was successfully compared (e.g., when the `if a_line.rstrip() == b_line.rstrip():` condition is triggered).
However since I am currently reading the file I don't see how to do it.
Any ideas?
I was able to diff two 20 megabyte files in a little over 3 minutes using the following test code.
Every 10,000 lines I put a random number, which you can see diff'd in the results.
import random
import difflib
import os
import time
start = time.time()
NUM_LINES = int(10000000 / 4)
t1 = 'test1'
t2 = 'test2'
if os.path.exists(t1):
if os.path.exists(t2):
with open(t1, 'w+') as f1:
for number in range(1, NUM_LINES):
if number % 10000 == 0:
r = random.randint(1, number)
r = 1
f1.write(str(number * r) + '\n')
with open(t2, 'w+') as f2:
for number in range(1, NUM_LINES):
if number % 10000 == 0:
r = random.randint(1, number)
r = 1
f2.write(str(number * r) + '\n')
t1 = f1.readlines()
t2 = f2.readlines()
for l in difflib.unified_diff(t1, t2, lineterm=''):
print('Execution took: {:.2f} seconds'.format(time.time() - start))
I pasted the output on github, as it is obscenely long.
I hope I am not downvoted this time. I have been struggling with parallel processing in Python for a while (2 days, to be exact). I have been checking these resources (a partial list is shown here):
(a) http://eli.thegreenplace.net/2013/01/16/python-paralellizing-cpu-bound-tasks-with-concurrent-futures
(b) https://pythonadventures.wordpress.com/tag/processpoolexecutor/
I came unstuck. What I want to do is this:
Break up the file into chunks(strings or numbers)
Broadcast a pattern to be searched to all the workers
Receive the offsets in the file where the pattern was found
Receive pattern and chunk of text from the master
Send back the offsets to the master.
I tried to implement this using MPI/concurrent.futures/multiprocessing and came unstuck.
My naive implementation using multiprocessing module
import multiprocessing
filename = "file1.txt"
pat = "afow"
N = 1000
""" This is the naive string search algorithm"""
def search(pat, txt):
    """Find every occurrence of ``pat`` in ``txt`` by naive scanning.

    Each candidate start position is tested in turn, so overlapping
    occurrences are reported as well.

    Args:
        pat: Pattern string to look for.
        txt: Text to scan.

    Returns:
        The matching start offsets rendered as a comma-separated string
        (for example ``"0, 3"``), or ``""`` when the pattern never occurs.
    """
    # The last window that can still hold the whole pattern starts at
    # len(txt) - len(pat); range() is empty when the pattern is longer.
    last_start = len(txt) - len(pat)
    offsets = [i for i in range(last_start + 1) if txt.startswith(pat, i)]
    # str(list) yields "[0, 3]"; stripping the brackets leaves only the
    # comma-separated offsets.
    return str(offsets).strip('[]')
This is what I want
if __name__ == "__main__":
tasks = []
pool_outputs = []
pool = multiprocessing.Pool(processes=5)
with open(filename, 'r') as infile:
lines = []
for line in infile:
if len(lines) > N:
pool_output = pool.map(search, tasks)
lines = []
if len(lines) > 0:
pool_output = pool.map(search, tasks)
print('Pool:', pool_outputs)
with open(filename, 'r') as infile:
for line in infile:
print(search(pat, line))
I would be grateful for any guidance especially with the concurrent.futures. Thanks for your time. Valeriy helped me with his addition and I thank him for that.
But if anyone could just indulge me for a moment, this is the code I was working on for the concurrent.futures(working off an example I saw somewhere)
from concurrent.futures import ProcessPoolExecutor, as_completed
import math
def search(pat, txt):
    """Find every occurrence of ``pat`` in ``txt`` by naive scanning.

    Each candidate start position is tested in turn, so overlapping
    occurrences are reported as well.

    Args:
        pat: Pattern string to look for.
        txt: Text to scan.

    Returns:
        The matching start offsets rendered as a comma-separated string
        (for example ``"0, 3"``), or ``""`` when the pattern never occurs.
    """
    # The last window that can still hold the whole pattern starts at
    # len(txt) - len(pat); range() is empty when the pattern is longer.
    last_start = len(txt) - len(pat)
    offsets = [i for i in range(last_start + 1) if txt.startswith(pat, i)]
    # str(list) yields "[0, 3]"; stripping the brackets leaves only the
    # comma-separated offsets.
    return str(offsets).strip('[]')
#Check a list of strings
def chunked_worker(lines):
return {0: search("fmo", line) for line in lines}
def pool_bruteforce(filename, nprocs):
lines = []
with open(filename) as f:
lines = [line.rstrip('\n') for line in f]
chunksize = int(math.ceil(len(lines) / float(nprocs)))
futures = []
with ProcessPoolExecutor() as executor:
for i in range(nprocs):
chunk = lines[(chunksize * i): (chunksize * (i + 1))]
futures.append(executor.submit(chunked_worker, chunk))
resultdict = {}
for f in as_completed(futures):
return resultdict
filename = "file1.txt"
pool_bruteforce(filename, 5)
Thanks again , Valeriy and anyone who attempts to help me solve my riddle.
You are using several arguments, so:
import multiprocessing
from functools import partial
filename = "file1.txt"
pat = "afow"
N = 1000
""" This is the naive string search algorithm"""
def search(pat, txt):
    """Find every occurrence of ``pat`` in ``txt`` by naive scanning.

    Each candidate start position is tested in turn, so overlapping
    occurrences are reported as well.

    Args:
        pat: Pattern string to look for.
        txt: Text to scan.

    Returns:
        The matching start offsets rendered as a comma-separated string
        (for example ``"0, 3"``), or ``""`` when the pattern never occurs.
    """
    # The last window that can still hold the whole pattern starts at
    # len(txt) - len(pat); range() is empty when the pattern is longer.
    last_start = len(txt) - len(pat)
    offsets = [i for i in range(last_start + 1) if txt.startswith(pat, i)]
    # str(list) yields "[0, 3]"; stripping the brackets leaves only the
    # comma-separated offsets.
    return str(offsets).strip('[]')
if __name__ == "__main__":
tasks = []
pool_outputs = []
pool = multiprocessing.Pool(processes=5)
lines = []
with open(filename, 'r') as infile:
for line in infile:
tasks = lines
func = partial(search, pat)
if len(lines) > N:
pool_output = pool.map(func, lines )
elif len(lines) > 0:
pool_output = pool.map(func, lines )
print('Pool:', pool_outputs)
I have a working python script that, in a simplified way, works as follows:
open("A", 'r')
open("B", 'r')
open("C", 'w')
for lineA in A:
part1, part2, part3 = lineA.split(' ')
for lineB in B:
if part2 in lineB:
I want to check in file B if a section of the line of file A exists there. If so, write that whole line from file B in a new file C.
The process is somewhat time consuming the way I have designed it (1-I still consider myself a noob with Python, 2-There are at least 4 IF statements running inside the main FOR loop), and now I have started to use input files around 200x larger than previously, so I am getting times of around 5 hours per input file here.
I have tried to use multiprocessing but I can't seem to get it to work.
I tried a simple code inside my main() function initially, without any significant improvement and definitely without using more than one CPU:
p = Process(target=multi_thread, args=(arg1,arg2,arg3))
Then I tried the jobs approach:
jobs = []
for i in range(4):
p = Process(target='myfunc')
And a pool example I found here in the forums, to which I added a Return statement to my main function:
def multiproc(arg1,arg2,arg3):
return lineB # example of Return statment
def main():
pool = Pool(4)
with open('file.txt', 'w') as map_file:
# chunk the work into batches of 4 lines at a time
results = pool.map(multi_thread, map_file, 4)
if __name__ == "__main__":
The jobs approach actually created the file and then restarted 3 more times the whole process from scratch. This last one gives me the following error:
io.UnsupportedOperation: not readable
And I also suppose that my Return statement is breaking my loop...
Any suggestions to enable multiprocessing for this piece of code, or also to improve its neatness?
As requested, here is the full messy code:
__author__ = 'daniel'
import os
import re
from multiprocessing import Process
from multiprocessing import Pool
import time
start_time = time.time()
def multi_thread(filePath, datasetFolder, mapFileDataset):
fout = open('outdude.txt', 'w')
cwd = os.getcwd()
cwdgen, sep, id = filePath.rpartition('/')
dataset = datasetFolder.rsplit("/",1)
dataset = dataset[1]
## Create file
for i in os.listdir(cwd):
if ".ped" in i:
sample_id, sep, rest = i.partition('.ped')
for i in os.listdir(cwd):
if sample_id+'.pileupgatk' in i and dataset in i:
pileup4map = open(i,'r')
snpcounter = sum(1 for _ in pileup4map)-1
mapout = open(sample_id+'.map', 'w')
counter = 1
for line in pileup4map:
if counter <= snpcounter:
mapFileData = open(datasetFolder+'/'+mapFileDataset,'r')
line = line.rstrip()
chro, coord, refb, rbase, qual = line.split(' ')
chrom = chro.strip("chr")
for ligna in mapFileData:
if coord in ligna:
k = re.compile(r'(?=%s )' % coord, re.I)
lookAhead = k.search(ligna)
k = re.compile(r'(?<= %s)' % coord, re.I)
lookBehind = k.search(ligna)
if lookAhead and lookBehind != None:
lignaChrom = ligna[:2].rstrip(' ')
if chrom == lignaChrom:
lignaOut = ligna.rstrip()
## For POOL
return lignaOut
def main():
# p = Process(target=multi_thread, args=('/home/full_karyo.fa', '/home/haak15', 'dataPP.map'))
# p.start()
# p.join()
# print("--- %s seconds ---" % (time.time() - start_time))
# jobs = []
# for i in range(4):
# p = Process(target=multi_thread, args=('/home/full_karyo.fa', '/home/haak15', 'dataPP.map'))
# jobs.append(p)
# p.start()
# p.join()
pool = Pool(4)
with open('file.txt', 'w') as map_file:
# chunk the work into batches of 4 lines at a time
results = pool.map(multi_thread, map_file, 4)
print("--- %s seconds ---" % (time.time() - start_time))
if __name__ == "__main__":
Following Robert E's and TheBigC's advice, I rewrote my code and it is now 13x faster, and not as confusing. I used a dictionary approach that is not as I/O hungry as the previous one, as TheBigC pointed out. I am happy enough with the speed, so I will leave multiprocessing aside for now. Thanks for the comments!
if makemap == True:
## Dictionary method - 13X faster
for i in os.listdir(cwd):
if ".ped" in i:
sample_id, sep, rest = i.partition('.ped')
for i in os.listdir(cwd):
if sample_id+'.pileupgatk' in i and dataset in i:
print("\n\t> Creating MAP file from sample: "+sample_id)
pileup4map = open(i,'r')
snpcounter = sum(1 for _ in pileup4map)-1
counter = 1
piledic = {}
for line in pileup4map:
if counter <= snpcounter:
line = line.rstrip()
#chr21 43805965 G G G
chro, coord, refb, rbase, qual = line.split(' ')
chrom = chro.strip("chr")
counter += 1
mapFileData = open(datasetFolder+'/'+mapFileDataset,'r')
mapDic = {}
counterM =1
for ligna in mapFileData:
#22 Affx-19821577 0.737773 50950707 A G
chroMap,ident,prob,posMap,bas1,bas2 = ligna.split()
counterM +=1
listOfmatches = []
for item in piledic:
if item in mapDic:
mapWrite = open(sample_id+".map", 'w')
lineCounter = 1
for lignagain in mapFileData:
if lineCounter in listOfmatches:
lineCounter +=1
I'm writing a downloader that splits a URL into parts and downloads them with threads. I will probably not use "join", because joining means I cannot stream (the file cannot be written until every thread has finished).
But the problem is that f.seek and write produce a really weird output file: its content always contains "NUL" characters (as shown in Notepad++), and the text in the file is only 1/3 of the whole file.
Hi everybody, thank you all for helping me. Here is version 2.0 of my code; thanks to Padraic Cunningham for his suggestion and explanation, I fixed my code almost exactly as suggested:
So please help me check the code; I think I also need your help to convert it to the http.server file streaming method:
import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re
pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"
countsize = 0
#if os.path.exists(fileName):
# os.remove(fileName)
def defwrite(filename,data,offset):
f = open(filename,'wb')
def buildRange(url, numsplits):
    """Ask the server for the size of ``url`` and split it into byte ranges.

    A HEAD request with ``Accept-Encoding: identity`` is sent and the
    ``content-length`` response header is used as the total size.

    Args:
        url: Address of the resource to be downloaded.
        numsplits: Number of chunks wanted.

    Returns:
        A list of ``"first-last"`` strings with inclusive, zero-based byte
        positions, in ascending order, that together cover bytes
        ``0 .. size - 1`` exactly once. Fewer than ``numsplits`` entries are
        returned when the size is smaller than ``numsplits``.

    Raises:
        ValueError: If the response carries no ``content-length`` header.
    """
    response = requests.head(url, headers={'Accept-Encoding': 'identity'})
    length = response.headers.get('content-length')
    if length is None:
        # Without a size there is nothing to split; fail with a clear
        # message instead of the TypeError that int(None) would raise.
        raise ValueError("Size cannot be determined.")
    value = int(length)
    print("Fullsize: ", value)
    print("Try devide with 3 :", value / 3)
    lst = []
    for i in range(numsplits):
        # Byte ranges are inclusive, so each chunk ends one byte before the
        # next one starts and the final chunk ends at value - 1.
        first = i * value // numsplits
        last = (i + 1) * value // numsplits - 1
        if last >= first:  # skip empty chunks when value < numsplits
            lst.append('%s-%s' % (first, last))
    return lst
def main(url=None, splitBy=3):
global fileName, pool, countsize
start_time = time.time()
if not url:
print("Please Enter some url to begin download.")
#fileName = "1.jpg"
#print("%s bytes to download." % sizeInBytes)
# if not sizeInBytes:
# print("Size cannot be determined.")
# return
#sinzeInBytes = buildRange(url,
dataDict = {}
f = open(fileName,'wb')
# split total num bytes into ranges
#ranges = buildRange(url,int(sizeInBytes), splitBy)
ranges = buildRange(url, splitBy)
def downloadChunk(idx, irange):
#req = urllib.request.Request(url)
#req.headers['Range'] = 'bytes={}'.format(irange)
headers = urllib3._collections.HTTPHeaderDict()
headers.add('Range', 'bytes=' + str(irange))
data = pool.urlopen('GET', URL, headers=headers).data
#print("finish: " + str(irange))
offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
# print(irange)
f.seek(offset, 0)
countsize = countsize + offset
#defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))
# create one downloading thread per chunk
downloaders = [
args=(idx, irange),
for idx,irange in enumerate(ranges)
# start threads, let run in parallel, wait for all to finish
for th in downloaders:
#for th in downloaders:
#print('done: got {} chunks, total {} bytes'.format(
# len(dataDict), sum( (
## len(chunk) for chunk in list(dataDict.values())
# ) )
#print("--- %s seconds ---" % str(time.time() - start_time))
# if os.path.exists(fileName):
# os.remove(fileName)
#reassemble file in correct order
#with open(fileName, 'wb') as fh:
# for _idx,chunk in sorted(dataDict.items()):
# fh.write(chunk)
#stream_chunk = 16 * 1024
#with open(fileName, 'wb') as fp:
# while True:
# for _idx,chunk in sorted(dataDict.items()):
# chunking = chunk.read(stream_chunk)
# if not chunk:
# break
# fp.write(chunking)
# print("Finished Writing file %s" % fileName)
#print('file size {} bytes'.format(os.path.getsize(fileName)))
if __name__ == '__main__':
if os.path.exists(fileName):
main(URL, splitBy=16)
Here is my code, please help me fix it: Version 1.0, ignore it, version 2.0 above:
import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re
pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"
#if os.path.exists(fileName):
# os.remove(fileName)
def defwrite(filename,data,offset):
f = open(filename,'wb')
def buildRange(value, numsplits):
    """Split ``value`` bytes into contiguous HTTP ``Range`` specs.

    Args:
        value: Total size of the resource in bytes (an integer).
        numsplits: Number of chunks wanted.

    Returns:
        A list of ``"first-last"`` strings with inclusive, zero-based byte
        positions, in ascending order, that together cover bytes
        ``0 .. value - 1`` exactly once. Fewer than ``numsplits`` entries are
        returned when ``value`` is smaller than ``numsplits``; the list is
        empty when either argument is not positive.
    """
    lst = []
    for i in range(numsplits):
        # Integer arithmetic keeps the chunk boundaries exact; rounding
        # floats can leave a gap or produce a range whose end precedes its
        # start.
        first = i * value // numsplits
        # Byte ranges are inclusive, so each chunk ends one byte before the
        # next one starts and the final chunk ends at value - 1.
        last = (i + 1) * value // numsplits - 1
        if last >= first:  # skip empty chunks when value < numsplits
            lst.append('%s-%s' % (first, last))
    return lst
def main(url=None, splitBy=3):
global fileName, pool
start_time = time.time()
if not url:
print("Please Enter some url to begin download.")
#fileName = "1.jpg"
sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
print("%s bytes to download." % sizeInBytes)
if not sizeInBytes:
print("Size cannot be determined.")
dataDict = {}
# split total num bytes into ranges
ranges = buildRange(int(sizeInBytes), splitBy)
def downloadChunk(idx, irange):
#req = urllib.request.Request(url)
#req.headers['Range'] = 'bytes={}'.format(irange)
headers = urllib3._collections.HTTPHeaderDict()
headers.add('Range', 'bytes=' + str(irange))
data = pool.urlopen('GET', URL, headers=headers).data
print("finish: " + str(irange))
offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
# print(irange)
f = open(fileName,'wb')
#defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))
# create one downloading thread per chunk
downloaders = [
args=(idx, irange),
for idx,irange in enumerate(ranges)
# start threads, let run in parallel, wait for all to finish
for th in downloaders:
#for th in downloaders:
#print('done: got {} chunks, total {} bytes'.format(
# len(dataDict), sum( (
## len(chunk) for chunk in list(dataDict.values())
# ) )
#print("--- %s seconds ---" % str(time.time() - start_time))
# if os.path.exists(fileName):
# os.remove(fileName)
#reassemble file in correct order
#with open(fileName, 'wb') as fh:
# for _idx,chunk in sorted(dataDict.items()):
# fh.write(chunk)
#stream_chunk = 16 * 1024
#with open(fileName, 'wb') as fp:
# while True:
# for _idx,chunk in sorted(dataDict.items()):
# chunking = chunk.read(stream_chunk)
# if not chunk:
# break
# fp.write(chunking)
# print("Finished Writing file %s" % fileName)
#print('file size {} bytes'.format(os.path.getsize(fileName)))
if __name__ == '__main__':
main(URL, splitBy=3)
You use three threads whose target function is downloadChunk, and you open the file three times using "wb", which overwrites it, so you get 1/3 of the content. You also call seek for no apparent reason. If you wanted to append to a file, you would open it using "a" each time, or just open the file once outside the functions.
You are trying to seek using an empty file and write so that is where the null bytes come from.
If you want to open a file for reading and writing so you can seek with line buffering:
with open("whatever.file", "r+b",buffering=1) as f
Then use that file to write to, don't keep opening in the function and overwriting, the file must also exist.
I put trailing print() methods right next to my write() method lines at the end of my code to test why my output files were incomplete. But, the print() output is "all the stuff" I expect; while the write() output is off by a confusing amount (only 150 out of 200 'things'). Reference Image of Output: IDLE versus external output file
FYI: Win 7 64 // Python 3.4.2
My modules take an SRT captions file ('test.srt') and returns a list object I create from it; in particular, one with 220 list entries of the form: [[(index), [time], string]]
times = open('times.txt', 'w')
### A portion of Riobard's SRT Parser: srt.py
import re
# Optional "hh:" and "mm:" fields, an optional seconds field, then an
# optional ",mmm" / ".mmm" fraction. Every part may be absent, so the
# pattern also matches the empty string.
_TIMECODE_RE = re.compile(
    r'(?:(?:(?:(\d?\d):)?(\d?\d):)?(\d?\d))?(?:[,.](\d?\d?\d))?'
)


def tc2ms(tc):
    """Convert a timecode such as ``"00:01:02,345"`` to milliseconds.

    Args:
        tc: Timecode string, optionally prefixed with ``+`` or ``-``.
            Missing fields count as zero, so ``""`` converts to ``0``.

    Returns:
        The signed duration in milliseconds as an ``int``. The digits after
        ``,`` or ``.`` are read as an integer millisecond count, so ``",5"``
        contributes 5 ms.
    """
    sign = 1
    if tc.startswith(("+", "-")):
        if tc[0] == "-":
            sign = -1
        tc = tc[1:]
    # Because every part of the pattern is optional, match() always
    # succeeds; unmatched groups come back as None and count as zero.
    match = _TIMECODE_RE.match(tc)
    hh, mm, ss, ms = (0 if part is None else int(part) for part in match.groups())
    return ((hh * 3600 + mm * 60 + ss) * 1000 + ms) * sign
# my code
with open('test.srt') as f:
file = f.read()
srt = []
for line in file:
splitter = file.split("\n\n")
# SRT splitter
i = 0
j = len(splitter)
for items in splitter:
while i <= j - 2:
split_point_1 = splitter[i].index("\n")
split_point_2 = splitter[i].index("\n", split_point_1 + 1)
index = splitter[i][:split_point_1]
time = [splitter[i][split_point_1:split_point_2]]
time = time[0][1:]
string = splitter[i][split_point_2:]
string = string[1:]
list = [[(index), [time], string]]
srt += list
i += 1
# time info outputter
i = 0
j = 1
for line in srt:
if i != len(srt) - 1:
indexer = srt[i][1][0].index(" --> ")
timein = srt[i][1][0][:indexer]
timeout = srt[i][1][0][-indexer:]
line_time = (tc2ms(timeout) - tc2ms(timein))/1000
space_time = ((tc2ms((srt[j][1][0][:indexer]))) - (tc2ms(srt[i][1][0][-indexer:])))/1000
out1 = "The space between Line " + str(i) + " and Line " + str(j) + " lasts " + str(space_time) + " seconds." + "\n"
out2 = "Line " + str(i) + ": " + str(srt[i][2]) + "\n\n"
print(out1, end="")
i += 1
j += 1
indexer = srt[i][1][0].index(" --> ")
timein = srt[i][1][0][:indexer]
timeout = srt[i][1][0][-indexer:]
line_time = (tc2ms(timeout) - tc2ms(timein))/1000
outend = "Line " + str(i) + ": " + str(srt[i][2]) + "\n<End of File>"
My two write() method output files, respectively, only print out either ~150 or ~200 items of the 220 things it otherwise correctly prints to the screen.
You want to close your times file when done writing; operating systems use write buffers to speed up file I/O, collecting larger blocks of data to be written to disk in one go; closing the file flushes that buffer:
Consider opening the file in a with block:
with open('times.txt', 'w') as times:
# all code that needs to write to times
Essentially what I am attempting to do is read 'n' number of lines from a file and then write them to a separate file. This program essentially should take a file that has 100 lines and separate that file into 50 separate files.
def main():
from itertools import islice
userfile = raw_input("Please enter the file you wish to open\n(must be in this directory): ")
file1 = open(userfile, "r+")
#print "Name: ", file1.name
#print "Closed or not", file1.closed
#print "Opening mode: ", file1.mode
#print "Softspace flag: ", file1.softspace
jcardtop = file1.read(221);
#print jcardtop
n = 2
count = 0
while True:
next_n_lines = list(islice(file1,n))
print next_n_lines
count = count + 1
fileout = open(str(count)+ ".txt", "w+")
if not next_n_lines:
I do have the file printing as well to show what is in the variable next_n_lines.
*['\n', "randomtext' more junk here\n"]
I would like it instead to look like
randomtext' more junk here
Is this a limitation of the islice function? Or am I missing a portion of the syntax?
Thanks for your time!
Where you call str() or print, you want to ''.join(next_n_lines) instead:
print ''.join(next_n_lines)
You can store the flattened string in a variable if you don't want to call join twice.
Did you mean something like this?
f = open(userfile,"r")
start = 4
n_lines = 100
for line in f.readlines()[start:(start + n_lines)]:
print line
#do stuff with line
or maybe this rough, yet effective code:
f = open(userfile,"r")
start = 4
end = start + 100
count = start
while count != end:
for line in f.readlines()[count:(count + 2)]:
fileout = open(str(count)+ ".txt", "w+")
count = count + 2