hadoop filesystem open file and skip first line - python

I'm reading files from HDFS using Python.
Each file has a header, and I'm trying to merge the files into one. However, the header from each file also gets merged.
Is there a way to skip the header from every file after the first?
hadoop = sc._jvm.org.apache.hadoop
conf = hadoop.conf.Configuration()
fs = hadoop.fs.FileSystem.get(conf)

src_dir = "/mnt/test/"
out_stream = fs.create(hadoop.fs.Path(dst_file), overwrite)

files = []
for f in fs.listStatus(hadoop.fs.Path(src_dir)):
    if f.isFile():
        files.append(f.getPath())

for file in files:
    in_stream = fs.open(file)
    hadoop.io.IOUtils.copyBytes(in_stream, out_stream, conf, False)
I've currently solved the problem with the logic below, but I'd like to know whether there is a better, more efficient solution. I appreciate your help.
for file_idx, file in enumerate(files):
    if debug:
        print("Appending file {} into {}".format(file, dst_file))

    # remove the header from every file after the first
    if file_idx > 0:
        file_str = ""
        with open('/' + str(file).replace(':', ''), 'r+') as f:
            for line_idx, line in enumerate(f):
                if line_idx > 0:
                    file_str = file_str + line
        with open('/' + str(file).replace(':', ''), "w+") as f:
            f.write(file_str)

    in_stream = fs.open(file)  # InputStream object; copy the stream
    try:
        hadoop.io.IOUtils.copyBytes(in_stream, out_stream, conf, False)  # False means don't close out_stream
    finally:
        in_stream.close()

What you are doing now is appending to a string repeatedly, which is fairly slow. Why not write directly to the output file as you read?
with open(...) as out_f:  # open the output once; reopening it per file would truncate it
    for file_idx, file in enumerate(files):
        with open(...) as in_f:
            for line_num, line in enumerate(in_f):
                if file_idx == 0 or line_num > 0:
                    out_f.write(line)
If you can afford to load each file all at once, you can also skip the first line by using readline followed by readlines:
with open(...) as out_f:
    for file_idx, file in enumerate(files):
        with open(...) as in_f:
            if file_idx != 0:
                in_f.readline()  # consume and discard the header line
            out_f.writelines(in_f.readlines())
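A middle ground worth mentioning: itertools.islice can skip the header lazily, so no file is ever held whole in memory. A minimal sketch, assuming files is a list of local paths and dst_file is the merge target (names borrowed from the question):

from itertools import islice

with open(dst_file, "w") as out_f:
    for file_idx, path in enumerate(files):
        with open(path) as in_f:
            start = 0 if file_idx == 0 else 1  # keep the header only from the first file
            out_f.writelines(islice(in_f, start, None))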

Related

How to read a line while reading a file in Python

Imagine that the txt file we're currently reading contains the name of another file in the same directory:
For example, let file A be the following:
B.txt
computer
science
How would it be possible to read the other lines and go into B.txt after we're done reading?
If you want to read the first line separately, you can do it with readline(). The loop then reads the file from the second line to the end:
import os

def read_files_to_list(wordlist, file):
    with open(file, "r") as f:
        newfile = f.readline()
        newfile = newfile.strip()  # removes \n and whitespace
        if not os.path.exists(newfile):
            # the first line is not an existing file, so treat it as an ordinary word
            wordlist.append(newfile)
            newfile = None
        for line in f:
            line_clean = line.strip()
            wordlist.append(line_clean)
    return wordlist, newfile

next_file = "A.txt"
listofwords = []
while next_file is not None:
    listofwords, next_file = read_files_to_list(listofwords, next_file)
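One caveat: if two files ever point at each other, this loop never terminates. A defensive variant (the visited set is my addition, not part of the original answer):

visited = set()
next_file = "A.txt"
listofwords = []
while next_file is not None and next_file not in visited:
    visited.add(next_file)  # remember files we've already read
    listofwords, next_file = read_files_to_list(listofwords, next_file)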

How to read a file and print it, skipping certain lines in python

I would like to read a file line by line but ignore any line that contains a colon (:).
I'm currently opening one file, reading it, and trying to print it before eventually putting it into a new file.
def shoppinglist():
    infile = open('filename.txt')
    contents = infile.readline()
    output = open('outputfilename.txt', 'w')
    while ":" not in contents:
        contents = infile.readline()
    else:
        contentstr = contents.split()
        print(contentstr)
        output.write(contents)
    infile.close()
    output.close()
As it is, one line is repeated over and over and over.
Try:
def shoppinglist():
    contents = ""
    with open('filename.txt', 'r') as infile:
        for line in infile.readlines():
            if ":" not in line:
                contents += line
    with open('outputfilename.txt', 'w') as output_file:
        output_file.write(contents)
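If the input might be large, here is a sketch of a variant that streams instead of accumulating everything in one string (same filenames as above):

def shoppinglist():
    with open('filename.txt') as infile, \
         open('outputfilename.txt', 'w') as output_file:
        for line in infile:  # iterate lazily, no readlines()
            if ":" not in line:
                output_file.write(line)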

Text to Excel in Python overwrites existing data

I have a small script that parses data from a text file into an Excel file. Instead of running the counter all the way to 166, it stops at 134 and then doesn't do anything.
I also have a file close operation, but it doesn't close the file, and it looks like the script continues to run.
Any thoughts? What am I doing wrong?
path = ('C:\\Users\\40081\\PycharmProjects\\abcd')
#file_name = open('parsed_DUT1.txt', 'r')
file_name = 'parsed_DUT1.txt'
count = 1
for line in file_name:
    inputfile = open(file_name)
    outputfile = open("parsed_DUT1" + '.xls', 'w+')
    while count < 166:
        for line in inputfile:
            text = "TestNum_" + str(count*1)
            if text in line:
                #data = text[text.find(" ")+1:].split()[0]
                outputfile.writelines(line)
                count = count + 1
inputfile.close()
outputfile.close()
w+
Opens a file for both writing and reading. Overwrites the existing
file if the file exists. If the file does not exist, creates a new
file for reading and writing.
You are opening the output file in w+ mode, which overwrites it every time. Try with
outputfile = open("parsed_DUT1" + '.xls', 'a')  # 'a' opens a file for appending
I also suggest you handle files with the with statement:
with open(file_name) as inputfile, open("parsed_DUT1" + '.xls', 'a') as outputfile:
    # do stuff with the input and output files
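Putting it together, a hedged sketch of the corrected loop: open both files exactly once, so even 'w' mode truncates only at the start of the run. It assumes the TestNum_ markers appear in ascending order, and takes the 166 cutoff from the question:

count = 1
with open(file_name) as inputfile, \
     open('parsed_DUT1.xls', 'w') as outputfile:
    for line in inputfile:
        if count >= 166:
            break
        if "TestNum_" + str(count) in line:
            outputfile.write(line)
            count += 1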

How to parse a concatenated zone file

I have a large zone file that I want to move to another provider. My issue is that the export is one large concatenated zone file, whereas my new registrar only accepts single standard zone files.
For example allzone.txt contains:
somedomain.com
=========
Record data...
...
------------
anotherdomain.com
=========
Record data...
...
------------
evenmoredomain.com
=========
Record data...
...
------------
What I'd like to happen is for it to take the one file above and create 3 files:
somedomain.com.txt
anotherdomain.com.txt
evenmoredomain.com.txt
Inside each file, the delimiter lines, i.e. the leading
anydomain.com
=========
and the trailing
------------
are removed, leaving only the record data between them.
So a file should be named domainA.com.txt and contain just the corresponding record data.
I'm not sure of the best way to do this. I can split on a delimiter, but I'm not sure how to take that content and write it to a new file whose name is whatever comes before the delimiter (anydomain.com).
Thanks!
More or less
current_file = None

with open('allzone.txt') as f:
    # read line by line
    for line in f:
        # open next file and close previous
        if line.startswith('domain'):
            # close previous file
            if current_file:
                current_file.close()
            # open new file
            current_file = open(line.strip() + '.txt', 'w')
        # write to current file
        if current_file:
            if not (line.startswith('domain') or line.startswith('---') or line.startswith('===')):
                current_file.write(line)

# close last file
if current_file:
    current_file.close()
EDIT: new version for any domain
current_file = None

with open('allzone.txt') as f:
    # read line by line
    for line in f:
        # open next file
        if not current_file:
            # open new file
            current_file = open(line.strip() + '.txt', 'w')
            # skip next line
            next(f)
        else:
            # close previous file
            if line.startswith('---'):
                current_file.close()
                current_file = None
            # write records
            #elif not line.startswith('==='):  # use it if you don't use `next(f)`
            else:
                current_file.write(line)

# close last file
if current_file:
    current_file.close()
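On the sample allzone.txt this should produce somedomain.com.txt, anotherdomain.com.txt and evenmoredomain.com.txt, each holding only the record lines.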
Maybe something like this would work? It might still need some tweaking
def main():
    with open('allzone.txt', 'r+') as f:
        data = ''
        first_line = ''
        for line in f:
            if first_line == '':
                first_line = line
            elif line == '------------\n':
                new_file = open('%s.txt' % first_line.rstrip(), 'w+')
                new_file.write(data)
                new_file.close()
                first_line = ''
                data = ''
            elif line == '=========\n' or line == '...\n':
                pass
            else:
                data += line

if __name__ == '__main__':
    main()
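For what it's worth, a sketch of another approach: split the whole export on the record separator. This assumes allzone.txt fits comfortably in memory and that the separators match the sample exactly:

with open('allzone.txt') as f:
    blocks = f.read().split('------------\n')

for block in filter(str.strip, blocks):  # skip empty trailing chunks
    lines = block.strip().splitlines()
    domain = lines[0].strip()  # first line of a block is the domain name
    records = [l for l in lines[1:] if not l.startswith('===')]
    with open(domain + '.txt', 'w') as out:
        out.write('\n'.join(records) + '\n')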

Read many csv files and write them with utf8 encoding using Python

I'm using Python code to read from many csv files and set the encoding to utf8. I run into a problem when reading a file: I can read all the lines, but when I write them out, only one line gets written. Please help me check my code below:
def convert_files(files, ascii, to="utf-8"):
    for name in files:
        #print ("Convert {0} from {1} to {2}").format(name, ascii, to)
        with open(name) as f:
            print(name)
            count = 0
            lineno = 0
            #at this point I want to write the text below into each new file, on the first line
            #file_source.write('id;nom;prenom;nom_pere;nom_mere;prenom_pere;prenom_mere;civilite (1=homme 2=f);date_naissance;arrondissement;adresse;ville;code_postal;pays;telephone;email;civilite_demandeur (1=homme 2=f);nom_demandeur;prenom_demandeur;qualite_demandeur;type_acte;nombre_actes\n')
            for line in f.readlines():
                lineno += 1
                if lineno == 1:
                    continue
                file_source = open(name, mode='w', encoding='utf-8', errors='ignore')
                #pass
                #print (line)
                # start writing data to the new file with the encoding
                file_source.write(line)
                #file_source.close
                #print unicode(line, "cp866").encode("utf-8")

csv_files = find_csv_filenames('./csv', ".csv")
convert_files(csv_files, "cp866")
You're reopening the file during every iteration.
for line in f.readlines():
    lineno += 1
    if lineno == 1:
        continue
    # move the following line outside of the for block
    file_source = open(name, mode='w', encoding='utf-8', errors='ignore')
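A minimal sketch of that fix. Note it also writes to a second path (the .utf8 suffix is my invention), because opening the same file with mode='w' while still reading from it would truncate it:

def convert_files(files, ascii, to="utf-8"):
    for name in files:
        with open(name, encoding=ascii) as f, \
             open(name + '.utf8', mode='w', encoding=to,
                  errors='ignore') as out:
            for lineno, line in enumerate(f, start=1):
                if lineno == 1:
                    continue  # the question skips the first line
                out.write(line)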
If all you need is to change the character encoding of the files then it doesn't matter that they are csv files unless the conversion may change what characters are interpreted as delimiter, quotechar, etc:
def convert(filename, from_encoding, to_encoding):
    with open(filename, newline='', encoding=from_encoding) as file:
        data = file.read().encode(to_encoding)
    with open(filename, 'wb') as outfile:
        outfile.write(data)

for path in csv_files:
    convert(path, "cp866", "utf-8")
Add errors parameter to change how encoding/decoding errors are handled.
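For instance, a lossy variant (the name convert_lossy is mine) that substitutes the replacement character instead of raising UnicodeDecodeError:

def convert_lossy(filename, from_encoding, to_encoding):
    with open(filename, newline='', encoding=from_encoding,
              errors='replace') as file:
        # undecodable bytes become U+FFFD instead of raising
        data = file.read().encode(to_encoding, errors='replace')
    with open(filename, 'wb') as outfile:
        outfile.write(data)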
If files may be large then you could convert data incrementally:
import os
from shutil import copyfileobj
from tempfile import NamedTemporaryFile

def convert(filename, from_encoding, to_encoding):
    with open(filename, newline='', encoding=from_encoding) as file:
        with NamedTemporaryFile('w', encoding=to_encoding, newline='',
                                dir=os.path.dirname(filename)) as tmpfile:
            copyfileobj(file, tmpfile)
            tmpfile.delete = False
    os.replace(tmpfile.name, filename)  # rename tmpfile -> filename

for path in csv_files:
    convert(path, "cp866", "utf-8")
You can do this
def convert_files(files, ascii, to="utf-8"):
    for name in files:
        # open in binary mode so we can decode/encode explicitly
        with open(name, 'rb+') as f:
            data = f.read().decode(ascii).encode(to)
            f.seek(0)
            f.write(data)
            f.truncate()
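Usage then mirrors the driver from the question:

csv_files = find_csv_filenames('./csv', '.csv')  # helper defined in the question
convert_files(csv_files, 'cp866')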
