regarding re-factoring python code - python

so I have this python file which looks out for all the "label" tags in a XML file and does some modification with it. label is a string containing at max three lines. the code is manipulating XML file.
#1 label="Number of Packets Transmitted by the Source
Node of the Path to the Destination Node Of
the Path"
#2 label="Number of Packets Transmitted by the Source
node of the path to the destination node of
the path"
notice in label #2 words in second and third line are not in upper case which is not what I want. I want help in correcting logic of my program such that I should not write label twice.
import os
from io import StringIO, BytesIO
def splitAndMakeTitleCase(line):
# does something not relevant to context
fileList = open("AllFiles")
for fileStr in fileList:
fileName = fileStr.rstrip('\n')
openFile = open(fileName)
openNewFile = open(fileName+'TitleCase.xml','w')
lines = openFile.readlines()
for lineIndex in range(0,len(lines)):
line = lines[lineIndex]
skip = 0
if "label=" in line and "const" not in line:
segs = line.split('"')
if len(segs) >= 3:
pass
else:
openNewFile.write(lines[lineIndex])
secondTitleCaseLine = splitAndMakeTitleCase(lines[lineIndex + 1])
skip = lineIndex + 1
openNewFile.write(secondTitleCaseLine)
if '"' not in lines[lineIndex + 1]:
thirdTitleCaseLine = splitAndMakeTitleCase(lines[lineIndex + 2])
skip = lineIndex + 1
openNewFile.write(thirdTitleCaseLine)
openNewFile.write(lines[lineIndex])
openFile.close()
openNewFile.close()
#cmd = "mv " + fileName + "TitleCase.xml " + fileName
#os.system(cmd)

In your for loop you have the first if and then within that you do some printing to the file. Then after that you do another print of the line to the file. I think that you probably want that last line in a else like this:
for fileStr in fileList:
fileName = fileStr.rstrip('\n')
openFile = open(fileName)
openNewFile = open(fileName+'TitleCase.xml','w')
lines = openFile.readlines()
for lineIndex in range(0,len(lines)):
line = lines[lineIndex]
skip = 0
if "label=" in line and "const" not in line:
segs = line.split('"')
if len(segs) >= 3:
pass
else:
openNewFile.write(lines[lineIndex])
secondTitleCaseLine = splitAndMakeTitleCase(lines[lineIndex + 1])
skip = lineIndex + 1
openNewFile.write(secondTitleCaseLine)
if '"' not in lines[lineIndex + 1]:
thirdTitleCaseLine = splitAndMakeTitleCase(lines[lineIndex + 2])
skip = lineIndex + 1
openNewFile.write(thirdTitleCaseLine)
else:
openNewFile.write(lines[lineIndex])
openFile.close()
openNewFile.close()
#cmd = "mv " + fileName + "TitleCase.xml " + fileName
#os.system(cmd)

Related

Lines missing in python

I am writing a code in python where I am removing all the text after a specific word but in output lines are missing. I have a text file in unicode which have 3 lines:
my name is test1
my name is
my name is test 2
What I want is to remove text after word "test" so I could get the output as below
my name is test
my name is
my name is test
I have written a code but it does the task but also removes the second line "my name is"
My code is below
txt = ""
with open(r"test.txt", 'r') as fp:
for line in fp.readlines():
splitStr = "test"
index = line.find(splitStr)
if index > 0:
txt += line[:index + len(splitStr)] + "\n"
with open(r"test.txt", "w") as fp:
fp.write(txt)
It looks like if there is no keyword found the index become -1.
So you are avoiding the lines w/o keyword.
I would modify your if by adding the condition as follows:
txt = ""
with open(r"test.txt", 'r') as fp:
for line in fp.readlines():
splitStr = "test"
index = line.find(splitStr)
if index > 0:
txt += line[:index + len(splitStr)] + "\n"
elif index < 0:
txt += line
with open(r"test.txt", "w") as fp:
fp.write(txt)
No need to add \n because the line already contains it.
Your code does not append the line if the splitStr is not defined.
txt = ""
with open(r"test.txt", 'r') as fp:
for line in fp.readlines():
splitStr = "test"
index = line.find(splitStr)
if index != -1:
txt += line[:index + len(splitStr)] + "\n"
else:
txt += line
with open(r"test.txt", "w") as fp:
fp.write(txt)
In my solution I simulate the input file via io.StringIO. Compared to your code my solution remove the else branch and only use one += operater. Also splitStr is set only one time and not on each iteration. This makes the code more clear and reduces possible errore sources.
import io
# simulates a file for this example
the_file = io.StringIO("""my name is test1
my name is
my name is test 2""")
txt = ""
splitStr = "test"
with the_file as fp:
# each line
for line in fp.readlines():
# cut somoething?
if splitStr in line:
# find index
index = line.find(splitStr)
# cut after 'splitStr' and add newline
line = line[:index + len(splitStr)] + "\n"
# append line to output
txt += line
print(txt)
When handling with files in Python 3 it is recommended to use pathlib for that like this.
import pathlib
file_path = pathlib.Path("test.txt")
# read from wile
with file_path.open('r') as fp:
# do something
# write back to the file
with file_path.open('w') as fp:
# do something
Suggestion:
for line in fp.readlines():
i = line.find('test')
if i != -1:
line = line[:i]

Increment file name while writing file in Python

My code works and increments filename but only for two first files, after that it creates new strings in existing second file. Please help me upgrade code to increment go further.
text = 'some text'
file_path = '/path/to/file'
filename = 'textfile'
i = 1
txtfile = self.file_path + filename + str(i) + '.txt'
if not os.path.exists(txtfile):
text_file = open(txtfile, "a")
text_file.write(self.text)
text_file.close()
elif os.path.exists(txtfile) and i >= 1:
i += 1
text_file1 = open(self.file_path + filename + str(i) + '.txt', "a")
text_file1.write(self.text)
text_file1.close()
If your example is part of a loop, your resetting i to 1 in every iteration. Put the i=1 outside of this part.
And it will also start at 1 when you restart your program - sometimes not what you want.

python: read file and split the data

I have a file config and the contents are separated by space " "
cat config
/home/user1 *.log,*.txt 30
/home/user2 *.trm,*.doc,*.jpeg 10
I want to read this file,parse each line and print each field from the each line.
Ex:-
Dir = /home/user1
Fileext = *.log,*.txt
days=30
I couldn't go further than the below..
def dir():
file = open('config','r+')
cont = file.readlines()
print "file contents are %s" % cont
for i in range(len(cont)):
j = cont[i].split(' ')
dir()
Any pointers how to move further?
Your code is fine, you are just missing the last step processing each element of the splitted string, try this:
def dir():
file = open('config','r+')
cont = file.readlines()
print "file contents are %s" % cont + '\n'
elements = []
for i in range(len(cont)):
rowElems = cont[i].split(' ')
elements.append({ 'dir' : rowElems[0], 'ext' : rowElems[1], 'days' : rowElems[2] })
for e in elements:
print "Dir = " + e['dir']
print "Fileext = " + e['ext']
print "days = " + e['days']
dir()
At the end of this code, you will have all the rows processed and stored in an array of dictionaries you can easily access later.
You can write a custom function to parse each line, and then use the map function to apply that function against each line in file.readlines():
def parseLine(line):
# function to split and parse each line,
# and return the formatted string
Dir, FileExt, Days = line.split(' ')[:3]
return 'Dir = {}\nFileext = {}\nDays = {}'.format(Dir, FileExt, Days)
def dir():
with open('config','r+') as file:
print 'file contents are\n' + '\n'.join(map(parseLine, file.readlines()))
Results:
>>> dir()
file contents are
Dir = /home/user1
Fileext = *.log,*.txt
Days = 30
Dir = /home/user2
Fileext = *.trm,*.doc,*.jpeg
Days = 10

Pick parts from a txt file and copy to another file with python

I'm in trouble here. I need to read a file. Txt file that contains a sequence of records, check the records that I want to copy them to a new file.
The file content is like this (this is just an example, the original file has more than 30 000 lines):
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|316 #begin register
03000|SP|467
99999|33|130 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
The records that begin with 03000 and have the characters 'TO' must be written to a new file. Based on the example, the file should look like this:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Code:
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.read()
file.close()
# here I need to check if the record exists 03000 characters 'TO', if it exists, copy the recordset 00000-99999 for the new file.
I did multiple searches and found nothing to help me.
Thank you!
with open("file.txt",'r') as inFile, open("newFile.txt","w") as outFile:
outFile.writelines(line for line in inFile
if line.startswith("03000") and "TO" in line)
If you need the previous and the next line, then you have to iterate inFile in triads. First define:
def gen_triad(lines, prev=None):
after = current = next(lines)
for after in lines:
yield prev, current, after
prev, current = current, after
And then do like before:
outFile.writelines(''.join(triad) for triad in gen_triad(inFile)
if triad[1].startswith("03000") and "TO" in triad[1])
import re
pat = ('^00000\|\d+\|\d+.*\n'
'^03000\|TO\|\d+.*\n'
'^99999\|\d+\|\d+.*\n'
'|'
'^AAAAA\|\d+\|\d+.*\n'
'|'
'^ZZZZZ\|\d+\|\d+.*')
rag = re.compile(pat,re.MULTILINE)
with open('fifi.txt','r') as f,\
open('newfifi.txt','w') as g:
g.write(''.join(rag.findall(f.read())))
For files with additional lines between lines beginning with 00000, 03000 and 99999, I didn't find simpler code than this one:
import re
pat = ('(^00000\|\d+\|\d+.*\n'
'(?:.*\n)+?'
'^99999\|\d+\|\d+.*\n)'
'|'
'(^AAAAA\|\d+\|\d+.*\n'
'|'
'^ZZZZZ\|\d+\|\d+.*)')
rag = re.compile(pat,re.MULTILINE)
pit = ('^00000\|.+?^03000\|TO\|\d+.+?^99999\|')
rig = re.compile(pit,re.DOTALL|re.MULTILINE)
def yi(text):
for g1,g2 in rag.findall(text):
if g2:
yield g2
elif rig.match(g1):
yield g1
with open('fifi.txt','r') as f,\
open('newfifi.txt','w') as g:
g.write(''.join(yi(f.read())))
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.readlines()
file.close()
newFile.writelines(filter(lambda x:x.startswith("03000") and "TO" in x,content))
This seems to work. The other answers seem to only be writing out records that contain '03000|TO|' but you have to write out the record before and after that as well.
import sys
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# import file
file_name = sys.argv[1]
file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name
file = open(file_path,"r")
# ---------------------------------------------------------------
# create output files
output_file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name + '.out'
output_file = open(output_file_path,"w")
# create output files
# ---------------------------------------------------------------
# process file
temp = ''
temp_out = ''
good_write = False
bad_write = False
for line in file:
if line[:5] == 'AAAAA':
temp_out += line
elif line[:5] == 'ZZZZZ':
temp_out += line
elif good_write:
temp += line
temp_out += temp
temp = ''
good_write = False
elif bad_write:
bad_write = False
temp = ''
elif line[:5] == '03000':
if line[6:8] != 'TO':
temp = ''
bad_write = True
else:
good_write = True
temp += line
temp_out += temp
temp = ''
else:
temp += line
output_file.write(temp_out)
output_file.close()
file.close()
Output:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Does it have to be python? These shell commands would do the same thing in a pinch.
head -1 inputfile.txt > outputfile.txt
grep -C 1 "03000|TO" inputfile.txt >> outputfile.txt
tail -1 inputfile.txt >> outputfile.txt
# Whenever I have to parse text files I prefer to use regular expressions
# You can also customize the matching criteria if you want to
import re
what_is_being_searched = re.compile("^03000.*TO")
# don't use "file" as a variable name since it is (was?) a builtin
# function
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
for this_line in source_file:
if what_is_being_searched.match(this_line):
destination_file.write(this_line)
and for those who prefer a more compact representation:
import re
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
destination_file.writelines(this_line for this_line in source_file
if re.match("^03000.*TO", this_line))
code:
fileName = '1'
fil = open(fileName,'r')
import string
##step 1: parse the file.
parsedFile = []
for i in fil:
##tuple1 = (1,2,3)
firstPipe = i.find('|')
secondPipe = i.find('|',firstPipe+1)
tuple1 = (i[:firstPipe],\
i[firstPipe+1:secondPipe],\
i[secondPipe+1:i.find('\n')])
parsedFile.append(tuple1)
fil.close()
##search criterias:
searchFirst = '03000'
searchString = 'TO' ##can be changed if and when required
##step 2: used the parsed contents to write the new file
filout = open('newFile','w')
stringToWrite = parsedFile[0][0] + '|' + parsedFile[0][1] + '|' + parsedFile[0][2] + '\n'
filout.write(stringToWrite) ##to write the first entry
for i in range(1,len(parsedFile)):
if parsedFile[i][1] == searchString and parsedFile[i][0] == searchFirst:
for j in range(-1,2,1):
stringToWrite = parsedFile[i+j][0] + '|' + parsedFile[i+j][1] + '|' + parsedFile[i+j][2] + '\n'
filout.write(stringToWrite)
stringToWrite = parsedFile[-1][0] + '|' + parsedFile[-1][1] + '|' + parsedFile[-1][2] + '\n'
filout.write(stringToWrite) ##to write the first entry
filout.close()
I know that this solution may be a bit long. But it is quite easy to understand. And it seems an intuitive way to do it. And I have already checked this with the Data that you have provided and it works perfectly.
Please tell me if you need some more explanation on the code. I will definitely add the same.
I tip (Beasley and Joran elyase) very interesting, but it only allows to get the contents of the line 03000. I would like to get the contents of the lines 00000 to line 99999.
I even managed to do here, but I am not satisfied, I wanted to make a more cleaner.
See how I did:
file = open(url,'r')
newFile = open("newFile.txt",'w')
lines = file.readlines()
file.close()
i = 0
lineTemp = []
for line in lines:
lineTemp.append(line)
if line[0:5] == '03000':
state = line[21:23]
if line[0:5] == '99999':
if state == 'TO':
newFile.writelines(lineTemp)
else:
linhaTemp = []
i = i+1
newFile.close()
Suggestions...
Thanks to all!

Python - file management and processing multiple zip files

I have some zip files in a folder. I have a script to process them. The data that is to be written to a database is in a different file and its structure is as follows:
some_text;database;file_name
some_text2;database2;file_name2
....
What is the best way to process this file? Also, an error message should be reported if there is no matching zip file name in that file.
My current code:
filelist = glob.glob(os.path.join(rootdir, '*.zip'))
if filelist:
for file in filelist:
print "Working on file ", file
#get only file name without .zip for compare
aa = file.split(sl)
bb = aa[len(aa) -1]
cc = bb.split(".")
ime_sole = cc[0]
fle = codecs.open(rootdir + sl + 'portal_schools.txt',
'r',encoding="cp1250")
line = fle.readline()
# Read lines
for line in iter(fle):
#print line,
a,b,c = line.split(";")
if c == ime_sole:
print c
database = str(b)
#distdir = str(c)
else:
print "some text"
return
fle.close()
But this fails because it is being read line by line. If in the first line there is no match, the code stops. I need it to continue trough the file and then, after all is done, start with a new zip file.
I know my code is far from perfect. The problem was with else. I moved it to the end of the whole code. It was a novice mistake. I also inserted try-catch so if it fails on one zip file, the next one is still processed. Now, it looks something like this:
filelist = glob.glob(os.path.join(rootdir, '*.zip'))
if filelist:
for file in filelist:
try:
aa = file.split(sl)
#print "aa ",aa
bb = aa[len(aa) -1]
#print "bb ", bb
cc = bb.split(".")
#print "cc ", cc
ime_sole = cc[0]
#print "imesole ", ime_sole
fle = codecs.open(rootdir + sl + 'portal_schools.txt','r',encoding="cp1250")
#line = fle.readline()
data = []
for line in iter(fle):
line = line.replace("\r\n", "")
x = line.split(";")
data.append(x)
result = [element for element in data if element[2] == ime_sole]
fle.close()
#print result
if result:
database = result[0][1]
vnos_data = "Podatki za %s , se vpisujejo v bazo %s " % (ime_sole, database)
host ="####"
user="####"
password = "####"
iUrnik_tables = iUrnik_tables_fromzip.Tables(defdir,file,sl,host,database,user,password)
id_skripte =iUrnik_tables[0]
date_begin = iUrnik_tables[1]
date_end = iUrnik_tables[2]
iUrnik_all_fromzip.FileWork(defdir,file,sl,host,database,user,password)
iUrnik_itt_zip.Proces(defdir,file,sl,host,database,user,password,id_skripte,date_begin,date_end)
trenutek = datetime.datetime.now()
trenutek = trenutek.strftime("%Y%m%d%H%M")
newfilename = os.path.splitext(file)[0]
newfilename = newfilename +"_" + str(trenutek) + os.path.splitext(file)[1]
folder = defdir + sl + ime_sole + sl + "archive"
destination = folder + sl
novoimezipa= destination + newfilename.split(sl)[-1]
if not os.path.exists(folder):
os.makedirs(folder)
os.chdir(folder)
shutil.copy(file,destination)
old = destination + file.split(sl)[-1]
os.rename(old , novoimezipa )
os.remove(file)
else:
nothing :)
#return
except:
print sys.exc_info()
else:
vnos_nodata= u"V mapi %s ni podatkov za prenos" % (rootdir)
Logging(defdir, sl, vnos_nodata)
I know it is not perfect but it works :)

Categories