I'm trying to merge multiple files into one file using Python. I've tried several methods, but they all result in the final file missing some lines. The size of the files can vary a lot, so I'd prefer something that does not load the whole file into memory.
My knowledge of this is a bit limited, but I read that it's probably due to write buffering: the file is not written immediately; the data is instead kept in memory for a moment and written to the file later.
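For reference, a minimal sketch of the flush-and-fsync pattern mentioned below (the file name is just a placeholder):
import os

with open('output.txt', 'a') as f:
    f.write('some data\n')
    f.flush()                 # push Python's internal buffer to the OS
    os.fsync(f.fileno())      # ask the OS to write its cached data to disk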
I've tried multiple ways to solve this: using shutil.copyfileobj, classic Python read/write, adding a tag to the end of the file, checking the tails of both files, using file.flush followed by os.fsync, and finally adding a few seconds of time.sleep. Everything fails; could anyone advise on an infallible way to merge files?
Some approaches seem to work fine on my local PC, but when tried on another system (an HPC cluster) the error occurs, so this is rather hard to replicate.
These are all the approaches I have tried so far:
# support functions
# (file_exists, move_file and BrokenConcatenation are helpers defined elsewhere in my code)
import os
import shutil
from time import sleep

def tail(file_path):
    last_line = None
    with open(file_path) as file:
        line = file.readline()
        while line:
            last_line = str(line)
            line = file.readline()
    return last_line

def wait_for_flush(output_file, tail_in):
    c = 0
    while not file_exists(output_file):
        sleep(5)
        c += 1
        if c > 100: raise BrokenConcatenation(output_file)
    tail_out = tail(output_file)
    while tail_out != tail_in:
        while not tail_out:
            sleep(2)
            tail_out = tail(output_file)
            c += 1
            if c > 100: raise BrokenConcatenation(output_file)
        tail_out = tail(output_file)
        c += 1
        sleep(2)
        if c > 100: raise BrokenConcatenation(output_file)

def merge_two_files(file1, file2):
    with open(file1, 'a+') as f1:
        with open(file2) as f2:
            line = f2.readline()
            while line:
                f1.write(line)
                line = f2.readline()
        # forcing disk write
        f1.flush()
        os.fsync(f1)

# main functions
def concat_files(output_file, list_file_paths, stdout_file=None, add_tag=False):
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    print(output_file)
    list_files = list(list_file_paths)
    while len(list_files) > 1:
        file1 = list_files.pop(0)
        file2 = list_files.pop(0)
        merge_two_files(file1, file2)
        sleep(1)
        os.remove(file2)
        list_files.append(file1)
    final_file = list_files.pop()
    move_file(final_file, output_file)

def concat_files(output_file, list_file_paths, stdout_file=None, add_tag=False):
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    with open(output_file, 'wb', buffering=0) as wfd:
        for f in list_file_paths:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd)
            # forcing disk write
            wfd.flush()
            os.fsync(wfd)
            sleep(2)

def concat_files(output_file, list_file_paths, stdout_file=None, add_tag=False):
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    with open(output_file, 'w+') as wfd:
        for f in list_file_paths:
            with open(f) as fd:
                line = fd.readline()
                while line:
                    wfd.write(line)
                    line = fd.readline()
                if add_tag:
                    tail_in = '#' + f + '\n'
                    wfd.write(tail_in)
                else:
                    tail_in = tail(f)
            # forcing disk write
            wfd.flush()
            os.fsync(wfd)
            wait_for_flush(output_file, tail_in)

# resets the output file every time we open it, doesn't work
def concat_files(output_file, list_file_paths, stdout_file=None):
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    for f in list_file_paths:
        with open(output_file, 'wb') as wfd:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd)
            # forcing disk write
            wfd.flush()
            os.fsync(wfd)

def concat_files(output_file, list_file_paths, stdout_file=None):
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    with open(output_file, 'w+') as outfile:
        for f in list_file_paths:
            with open(f) as infile:
                line = infile.readline()
                while line:
                    outfile.write(line)
                    line = infile.readline()
            # forcing disk write
            outfile.flush()
            os.fsync(outfile)

def concat_files(output_file, list_file_paths, stdout_file=None):
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    with open(output_file, 'wb') as wfd:
        for f in list_file_paths:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd)
            # forcing disk write
            wfd.flush()
            os.fsync(wfd)
If you don't want to read large files into memory, I would say this should just work:
def concat_files(output_file, list_file_paths):
    print('Concatenating files into', output_file)
    with open(output_file, 'w') as wfd:
        for f in list_file_paths:
            print(f, '...')
            with open(f) as fd:
                for line in fd:
                    wfd.write(line)
            wfd.write(f'eof - {f}\n')  # mod to indicate end of this file
    print('Done.')
This should create the output_file as a new file and read each file from list_file_paths, a line at a time, writing to the new file.
Update: see mod to indicate end of this file
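A small usage sketch (the file names here are hypothetical, just to show the call):
parts = ['results_part1.txt', 'results_part2.txt', 'results_part3.txt']
concat_files('results_merged.txt', parts)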
I am trying to read a number from a txt file, increase it by 1, and then write that number back to the file, but it just empties the file.
How do I fix this?
Code:
f = open('BestillingNr.txt', 'r')
bestillingNr = int(f.read())
bestillingNr += 1
f2 = open('BestillingNr.txt', 'w')
f2.write(f'{str(bestillingNr)}')
f.close()
f2.close
You need to close the second file. You are missing the () at the end of f2.close, so the close method is never actually executed.
In the example below, I am using with, which creates a context manager that automatically closes the file.
with open('BestillingNr.txt', 'r') as f:
    bestillingNr = int(f.read())

bestillingNr += 1

with open('BestillingNr.txt', 'w') as f2:
    f2.write(f'{str(bestillingNr)}')
After the line f2.write(f'{str(bestillingNr)}'), you should add a flush call: f2.flush().
This code works well:
f = open('BestillingNr.txt', 'r')
bestillingNr = int(f.read())
f.close()
bestillingNr += 1
f2 = open('BestillingNr.txt', 'w')
f2.write(f'{str(bestillingNr)}')
f2.flush()
f2.close()
I'm reading files from HDFS using Python.
Each file has a header, and I'm trying to merge the files. However, the header from each file also gets merged into the output.
Is there a way to skip the header from the second file onwards?
hadoop = sc._jvm.org.apache.hadoop
conf = hadoop.conf.Configuration()
fs = hadoop.fs.FileSystem.get(conf)

src_dir = "/mnt/test/"
out_stream = fs.create(hadoop.fs.Path(dst_file), overwrite)

files = []
for f in fs.listStatus(hadoop.fs.Path(src_dir)):
    if f.isFile():
        files.append(f.getPath())

for file in files:
    in_stream = fs.open(file)
    hadoop.io.IOUtils.copyBytes(in_stream, out_stream, conf, False)
Currently I have solved the problem with the logic below, but I would like to know whether there is a better and more efficient solution. I appreciate your help.
for idx, file in enumerate(files):
    if debug:
        print("Appending file {} into {}".format(file, dst_file))

    # remove header from the second file
    if idx > 0:
        file_str = ""
        with open('/' + str(file).replace(':', ''), 'r+') as f:
            for idx, line in enumerate(f):
                if idx > 0:
                    file_str = file_str + line
        with open('/' + str(file).replace(':', ''), "w+") as f:
            f.write(file_str)

    in_stream = fs.open(file)  # InputStream object and copy the stream
    try:
        hadoop.io.IOUtils.copyBytes(in_stream, out_stream, conf, False)  # False means don't close out_stream
    finally:
        in_stream.close()
What you are doing now is appending repeatedly to a string. This is a fairly slow process. Why not write directly to the output file as you are reading?
for file_idx, file in enumerate(files):
    with open(...) as out_f, open(...) as in_f:
        for line_num, line in enumerate(in_f):
            if file_idx == 0 or line_num > 0:
                out_f.write(line)
If you can load the file all at once, you can also skip the first line by using readline followed by readlines:
for file_idx, file in enumerate(files):
    with open(...) as out_f, open(...) as in_f:
        if file_idx != 0:
            in_f.readline()
        out_f.writelines(in_f.readlines())
I am trying to process several files into a single, merged CSV file using Python. So far, I have:
files = ["file1.txt", "file2.txt", "file3.txt"]
def doSomething(oldfile):
content = []
with open oldfile as file:
content = file.read().splitlines()
file.close()
return content.reverse()
with open("newfile.txt", "w") as file:
w = csv.writer(file, dialect = "excel-tab")
for i in range(0, len(files)):
w. writerows(doSomething(files[i])
file.close()
The new file is being created, but there is nothing in it. I am curious about what is going on.
Thanks!
For starters, list.reverse() reverses the list in place and doesn't return anything so you're essentially returning None from your doSomething() function. You'll actually want to split that into two lines:
content.reverse()
return content
If you want to streamline your code, here's a suggestion:
def doSomething(oldfile):
    with open(oldfile, "r") as f:
        return reversed(f.read().splitlines())

files = ["file1.txt", "file2.txt", "file3.txt"]

with open("newfile.txt", "wb") as file:
    w = csv.writer(file, dialect = "excel-tab")
    for current_file in files:
        w.writerows(doSomething(current_file))
I think your program crashes for several reasons:
open(..) is a function, so you cannot write:
with open oldfile as file:
A with statement for files is used to enforce closing of the file, so file.close() is actually not necessary.
.reverse() works in place: it returns None; you can use reversed(..) for that.
You can fix it with:
files = ["file1.txt", "file2.txt", "file3.txt"]
def doSomething(oldfile):
content = []
with open(oldfile,'r') as file:
return list(reversed(file))
with open("newfile.txt", "w") as file:
w = csv.writer(file, dialect = "excel-tab")
for oldfile in files:
w.writerows(doSomething(oldfile))
I also used a for loop over the list, instead of over the indices, since that is more "pythonic". Furthermore, a file is iterable over its lines, so reading them with readlines() and passing the result to reversed(..) yields the lines in reverse order.
The idea is to write N files using N processes.
The data for each output file come from multiple input files, whose names are stored in a dictionary that has a list as each value; it looks like this:
dic = {'file1': ['data11.txt', 'data12.txt', ..., 'data1M.txt'],
       'file2': ['data21.txt', 'data22.txt', ..., 'data2M.txt'],
       ...
       'fileN': ['dataN1.txt', 'dataN2.txt', ..., 'dataNM.txt']}
so file1 is data11 + data12 + ... + data1M etc...
So my code looks like this:
jobs = []
for d in dic:
    outfile = str(d) + "_merged.txt"
    with open(outfile, 'w') as out:
        p = multiprocessing.Process(target=merger.merger, args=(dic[d], name, out))
        jobs.append(p)
        p.start()
    out.close()
and the merger.py looks like this:
def merger(files, name, outfile):
    time.sleep(2)
    sys.stdout.write("Merging %n...\n" % name)

    # the reason for this step is that all the different files have a header
    # but I only need the header from the first file.
    with open(files[0], 'r') as infile:
        for line in infile:
            print "writing to outfile: ", name, line
            outfile.write(line)

    for f in files[1:]:
        with open(f, 'r') as infile:
            next(infile)  # skip first line
            for line in infile:
                outfile.write(line)

    sys.stdout.write("Done with: %s\n" % name)
I do see the file created in the folder it should go to, but it's empty: no header, nothing. I put prints in there to check whether everything is correct, but nothing works.
Help!
Since the worker processes run in parallel to the main process creating them, the files named out get closed before the workers can write to them. This will happen even if you remove out.close() because of the with statement. Rather pass each process the filename and let the process open and close the file.
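A minimal sketch of that approach, assuming the dic dictionary from the question (note that merger here takes a path instead of an open file object, so this is an illustration rather than a drop-in replacement for merger.py):
import multiprocessing

def merger(files, outfile_path):
    # the worker opens (and closes) its own output file
    with open(outfile_path, 'w') as out:
        # keep the header only from the first file
        with open(files[0]) as infile:
            for line in infile:
                out.write(line)
        for f in files[1:]:
            with open(f) as infile:
                next(infile)  # skip the header of the remaining files
                for line in infile:
                    out.write(line)

jobs = []
for d in dic:
    p = multiprocessing.Process(target=merger, args=(dic[d], str(d) + "_merged.txt"))
    jobs.append(p)
    p.start()

for p in jobs:
    p.join()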
The problem is that you don't close the file in the child, so internally buffered data is lost. You could move the file open to the child or wrap the whole thing in a try/finally block to make sure the file closes. A potential advantage of opening in the parent is that you can handle file errors there. I'm not saying it's compelling, just an option.
def merger(files, name, outfile):
    try:
        time.sleep(2)
        sys.stdout.write("Merging %n...\n" % name)

        # the reason for this step is that all the different files have a header
        # but I only need the header from the first file.
        with open(files[0], 'r') as infile:
            for line in infile:
                print "writing to outfile: ", name, line
                outfile.write(line)

        for f in files[1:]:
            with open(f, 'r') as infile:
                next(infile)  # skip first line
                for line in infile:
                    outfile.write(line)

        sys.stdout.write("Done with: %s\n" % name)
    finally:
        outfile.close()
UPDATE
There has been some confusion about parent/child file descriptors and what happens to files in the child. The underlying C library does not flush data to disk if a file is still open when the program exits. The theory is that a properly running program closes things before exit. Here is an example where the child loses data because it does not close the file.
import multiprocessing as mp
import os
import time

if os.path.exists('mytestfile.txt'):
    os.remove('mytestfile.txt')

def worker(f, do_close=False):
    time.sleep(2)
    print('writing')
    f.write("this is data")
    if do_close:
        print("closing")
        f.close()

print('without close')
f = open('mytestfile.txt', 'w')
p = mp.Process(target=worker, args=(f, False))
p.start()
f.close()
p.join()
print('file data:', open('mytestfile.txt').read())

print('with close')
os.remove('mytestfile.txt')
f = open('mytestfile.txt', 'w')
p = mp.Process(target=worker, args=(f, True))
p.start()
f.close()
p.join()
print('file data:', open('mytestfile.txt').read())
I run it on Linux and I get:
without close
writing
file data:
with close
writing
closing
file data: this is data
I'm using Python code to read from many CSV files and set the encoding to utf-8. The problem is that I can read all the lines of a file, but when I write it out, only one line gets written. Please help me check my code below:
def convert_files(files, ascii, to="utf-8"):
for name in files:
#print ("Convert {0} from {1} to {2}").format(name, ascii, to)
with open(name) as f:
print(name)
count = 0
lineno = 0
#this point I want to write the below text into my each new file at the first line
#file_source.write('id;nom;prenom;nom_pere;nom_mere;prenom_pere;prenom_mere;civilite (1=homme 2=f);date_naissance;arrondissement;adresse;ville;code_postal;pays;telephone;email;civilite_demandeur (1=homme 2=f);nom_demandeur;prenom_demandeur;qualite_demandeur;type_acte;nombre_actes\n')
for line in f.readlines():
lineno +=1
if lineno == 1 :
continue
file_source = open(name, mode='w', encoding='utf-8', errors='ignore')
#pass
#print (line)
# start write data to to new file with encode
file_source.write(line)
#file_source.close
#print unicode(line, "cp866").encode("utf-8")
csv_files = find_csv_filenames('./csv', ".csv")
convert_files(csv_files, "cp866")
You're reopening the file during every iteration.
for line in f.readlines():
    lineno += 1
    if lineno == 1:
        continue
    # move the following line outside of the for block
    file_source = open(name, mode='w', encoding='utf-8', errors='ignore')
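A hedged sketch of that fix: read the lines first and open the file for writing only once, outside the loop, so the input is not truncated while it is still being read (this assumes the source encoding is the ascii argument, which the original code accepts but never uses):
def convert_files(files, ascii, to="utf-8"):
    for name in files:
        # read everything before the same file is reopened for writing
        with open(name, encoding=ascii, errors='ignore') as f:
            lines = f.readlines()
        # open the output once, not inside the line loop
        with open(name, mode='w', encoding=to, errors='ignore') as file_source:
            for lineno, line in enumerate(lines, start=1):
                if lineno == 1:
                    continue  # skip the first line, as in the original code
                file_source.write(line)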
If all you need is to change the character encoding of the files then it doesn't matter that they are csv files unless the conversion may change what characters are interpreted as delimiter, quotechar, etc:
def convert(filename, from_encoding, to_encoding):
    with open(filename, newline='', encoding=from_encoding) as file:
        data = file.read().encode(to_encoding)
    with open(filename, 'wb') as outfile:
        outfile.write(data)

for path in csv_files:
    convert(path, "cp866", "utf-8")
Add the errors parameter to change how encoding/decoding errors are handled.
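For example, a hedged variant of the convert function above with an errors argument ('replace' substitutes characters that cannot be decoded or encoded instead of raising an error):
def convert(filename, from_encoding, to_encoding, errors='replace'):
    with open(filename, newline='', encoding=from_encoding, errors=errors) as file:
        data = file.read().encode(to_encoding, errors=errors)
    with open(filename, 'wb') as outfile:
        outfile.write(data)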
If files may be large then you could convert data incrementally:
import os
from shutil import copyfileobj
from tempfile import NamedTemporaryFile

def convert(filename, from_encoding, to_encoding):
    with open(filename, newline='', encoding=from_encoding) as file:
        with NamedTemporaryFile('w', encoding=to_encoding, newline='',
                                dir=os.path.dirname(filename)) as tmpfile:
            copyfileobj(file, tmpfile)
            tmpfile.delete = False
    os.replace(tmpfile.name, filename)  # rename tmpfile -> filename

for path in csv_files:
    convert(path, "cp866", "utf-8")
You can do this:
def convert_files(files, ascii, to="utf-8"):
    for name in files:
        with open(name, 'r+') as f:
            data = ''.join(f.readlines())
            data = data.decode(ascii).encode(to)
            f.seek(0)
            f.write(data)
            f.truncate()