I wrote a python script to treat text files.
The input is a file with several lines. At the beginning of each line, there is a number (1, 2, 3... , n). Then an empty line and the last line on which some text is written.
I need to read through this file to delete some lines at the beginning and some in the end (say number 1 to 5 and then number 78 to end). I want to write the remaining lines on a new file (in a new directory) and renumber the first numbers written on these lines (in my example, 6 would become 1, 7 2 etc.)
I wrote the following:
def treatFiles(oldFile, newFile, firstF, startF, lastF):
    """Copy oldFile to newFile, dropping lines before startF and after
    lastF, and renumbering the leading comma-separated field of each
    kept line so the first kept line becomes 1.

    firstF  -- the number written on the first line of the file (an index)
    startF  -- the first line (by its written number) to keep
    lastF   -- the last line (by its written number) to keep

    Blank lines are copied through unchanged.
    """
    # As posted, the comments used MATLAB-style "%" markers, which are a
    # syntax error in Python; they are restored as "#" comments here.
    numberFToDeleteBeginning = int(startF) - int(firstF)
    with open(oldFile) as old, open(newFile, 'w') as new:
        countLine = 0
        for line in old:
            countLine += 1
            if countLine <= numberFToDeleteBeginning:
                pass                                    # before the kept range
            elif countLine > int(lastF) - int(firstF):
                pass                                    # after the kept range
            elif line.split(',')[0] == '\n':
                # Blank separator line: copy verbatim.  (The original also
                # computed an unused line.split(',') here; removed.)
                new.write(line)
            else:
                # Replace the old leading number with the renumbered one.
                newLineList = [str(countLine - numberFToDeleteBeginning)] + line.split(',')
                del newLineList[1]
                newLine = str(newLineList[0])
                for k in range(1, len(newLineList)):
                    newLine = newLine + ',' + str(newLineList[k])
                new.write(newLine)
if __name__ == '__main__':
    from sys import argv
    import os
    # exist_ok avoids crashing when the output directory already exists
    # (bare makedirs raises FileExistsError on a second run).
    os.makedirs('treatedFiles', exist_ok=True)
    new = 'treatedFiles/' + argv[1]
    # treatFiles takes exactly five arguments: old file, new file, index,
    # first kept line, last kept line.  The original call passed six,
    # including an undefined name `newFile`; the computed `new` path is
    # presumably the intended destination -- confirm against usage.
    treatFiles(argv[1], new, argv[2], argv[3], argv[4])
My code works correctly but is far too slow (I have files of about 10Gb to treat and it's been running for hours).
Does anyone know how I can improve it?
I would get rid of the for loop in the middle and the expensive .split():
from itertools import islice
def treatFiles(old_file, new_file, index, start, end):
    """Copy the kept slice of old_file into new_file, renumbering the
    leading comma-separated field of each line starting from 1.

    index -- the number written on the file's first line
    start -- first line (by written number) to keep
    end   -- one past the last line (by written number) to keep

    Lines containing no comma (e.g. the blank separator line and the
    trailing free-text line described in the question) are copied through
    unchanged.
    """
    with open(old_file, 'r') as old, open(new_file, 'w') as new:
        sliced_file = islice(old, start - index, end - index)
        for line_number, line in enumerate(sliced_file, start=1):
            parts = line.split(',', 1)
            if len(parts) == 1:
                # No comma at all: unpacking into (number, rest) would raise
                # ValueError here in the original -- pass the line through.
                new.write(line)
            else:
                new.write(str(line_number) + ',' + parts[1])
Also, convert your three numerical arguments to integers before passing them into the function:
treatFiles(argv[1], argv[2], int(argv[3]), int(argv[4]), int(argv[5]))
Related
I know it's not completely finished, but I'm very confused about how to format the save inventory function so that it prints like the original text file. During the add_item function, it shows that the item has been added to the lists. But when I go to write the file, nothing is there or updated.
Example of how the text file needs to look
def save_inventory(inventoryFile, descriptionArray, quantityArray, priceArray, intrecords):
    """Rewrite inventoryFile from three parallel arrays (descriptions,
    quantities, prices), using inventory1.txt only to count record slots.

    NOTE(review): each .write() below is handed the whole array object
    rather than a single element, so the output cannot match the
    one-value-per-line layout of the original file (and writing a list,
    as in pass 1, is not valid for a text-mode file at all).
    intrecords is accepted but never used -- presumably the record count.
    """
    outfile = open(inventoryFile, "w")
    # Pass 1: write on every 3rd template line, starting at the 1st line.
    with open('inventory1.txt', 'r') as f:
        count = -1
        for line in f:
            count+=1
            if count % 3 == 0: #this is the remainder operator
                outfile.write(descriptionArray)
                print(descriptionArray)
    # Pass 2: same cadence, shifted one line later (count starts at -2).
    with open('inventory1.txt', 'r') as f:
        count = -2
        for line in f:
            count+=1
            if count % 3 == 0: #this is the remainder operator
                outfile.write(str(quantityArray))
                print(quantityArray)
    # Pass 3: shifted two lines later (count starts at -3).
    with open('inventory1.txt', 'r') as f:
        count = -3
        for line in f:
            count+=1
            if count % 3 == 0: #this is the remainder operator
                outfile.write(str(priceArray))
                print(priceArray)
    outfile.close()
You are only writing to the file when you have read a line. If your text file is empty you will never write to the file.
What I would do is zip the lists together and loop through them. Then write three lines to the file for each pass through the loop. You can print a carriage return with '\n'
# Walk the three parallel arrays in lockstep and emit one record
# (three lines: description, quantity, price) per pass.
with open(inventoryFile, 'w') as out:
    records = zip(descriptionArray, quantityArray, priceArray)
    for description, quantity, price in records:
        out.write('%s\n%s\n%s\n' % (description, quantity, price))
I'm trying to split up a very large text file into multiple smaller ones. When I run the code below, the first created file is correct. Everything after that just contains the 'INSERT INTO ...' string and nothing else. Thanks in advance
import math
interval = 100000

# Pass 1: count the lines of the input file.
with open('my-big-file','r') as c:
    for i, l in enumerate(c):
        pass
length = i + 1
numOfFiles = int(math.ceil(length / interval))

# Pass 2: re-open and write one output file per chunk.
with open('my-big-file','r') as c:
    for j in range(0, numOfFiles):
        with open('my-smaller-file_{}.sql'.format(j),'w') as n:
            # NOTE(review): "print >> file" is Python 2 syntax.
            print >> n, 'INSERT INTO codes (code, some-field, some-other-field) VALUES'
            # NOTE(review): enumerate restarts at 0 on the partially-consumed
            # file handle, so for j >= 1 the very first i fails the range test
            # and the loop breaks immediately -- which matches the reported
            # symptom of only the first file having content.
            for i, line in enumerate(c):
                if i >= j * interval and i < (j + 1) * interval:
                    line = line.rstrip()
                    if not line: continue
                    # NOTE(review): the nested single quotes here are not
                    # escaped as posted, so this line is a syntax error.
                    print >> n, '(%s,'something','something else'),' % (line)
                else:
                    break
You don't need to count the number of lines before iterating the file, you can directly write to a new file whenever you reach the number of given lines:
#!/usr/bin/env python
def split(fn, num=1000, suffix="_%03d"):
    """Split file *fn* into consecutive chunks of *num* lines each.

    Chunk k is written to <base><suffix % k><ext>, derived from fn.
    An empty input file produces no output files.
    """
    import os
    full, ext = os.path.splitext(fn)
    out = None  # current chunk file; None until the first line is read
    with open(fn, 'r') as f:
        for i, l in enumerate(f):
            if i % num == 0:
                # Chunk boundary: close the finished chunk, open the next.
                if out is not None:
                    out.close()
                # // keeps the chunk index an int under Python 3 ("/" would
                # hand a float to the %d format and silently truncate).
                out = open(full + suffix % (i // num) + ext, 'w')
            out.write(l)
    # Close the last chunk.  The original's for/else raised NameError on an
    # empty input file, since `out` was never bound.
    if out is not None:
        out.close()
if __name__ == '__main__':
    import sys
    # Split the file named on the command line into 1000-line chunks.
    split(sys.argv[1])
You can run this from the command line. Though probably the split command is more useful, since it supports a multitude of options.
It's also possible to rewrite this code to also use with for the file(s) being written to, but that's another topic.
I was trying to extract even lines from a text file and output to a new file. But with my codes python warns me "list index out of range". Anyone can help me? THANKS~
Code:
f = open('input.txt', 'r')
i = 0
j = 0
# This generator consumes the whole file; the pointer is at EOF afterwards.
num_lines = sum(1 for line in f)
newline = [0] * num_lines
print (num_lines)
for i in range(1, num_lines):
    if i % 2 == 0:
        # NOTE(review): f is already exhausted, so readlines() returns []
        # and indexing it raises IndexError -- the posted traceback.
        # (Even after a seek(0), calling readlines() per iteration would
        # re-read the entire file every time.)
        newline[i] = f.readlines()[i]
        print i, newline[i]   # Python 2 print statement
    i = i + 1   # redundant: the for loop already advances i
f.close()
f = open('output.txt', 'w')
for j in range(0,num_lines):
    if j % 2 == 0:
        f.write(newline[j] + '\n')
    j = j + 1   # redundant, as above
f.close()
Output:
17
Traceback (most recent call last):
File "./5", line 10, in <module>
a = f.readlines()[1]
IndexError: list index out of range
After
num_lines = sum(1 for line in f)
The file pointer in f is at the end of the file. Therefore any subsequent call of f.readlines() gives an empty list. The minimal fix is to use f.seek(0) to return to the start of the file.
However, a better solution would be to read through the file only once, e.g. using enumerate to get the line and its index i:
# Single pass over f: keep each line whose 0-based index is even.
newline = [line for i, line in enumerate(f) if i % 2 == 0]
In your original script you read the file once to scan the number of lines, then you (try to) read the lines in memory, you needlessly create a list for the full size instead of just extending it with list.append, you initialize the list with zeroes which does not make sense for a list containing strings, etc.
Thus, this script does what your original idea was, but better and simpler and faster:
# Stream the input once, copying the even-numbered (1-based) lines.
with open('input.txt', 'r') as source, open('output.txt', 'w') as sink:
    for number, text in enumerate(source, 1):
        if number % 2 == 0:
            sink.write(text)
Specifically
open the files with with statement so that they are automatically closed when
the block is exited.
write as they are read
as lines are numbered 1-based, use the enumerate with the start value 1 so that you truly get the even numbered lines.
You've also got the itertools.islice approach available:
from itertools import islice

# Skip the first line, then take every second one: this yields the
# even-numbered lines when counting 1-based, matching the answer above.
# (islice(fin, None, None, 2), as originally posted, starts at offset 0
# and therefore yields the ODD-numbered lines instead.)
with open('input') as fin, open('output', 'w') as fout:
    fout.writelines(islice(fin, 1, None, 2))
This saves the modulus operation and puts the line writing to system level.
I need to get a certain part of my file and write it in new file. Keep the rest in a new file. So I will have 3 files . 1) Original file 2)Selected lines 3) The rest . I have a code that works for taking the first selection. I'm having problem to get the next selection and so on. Here's my code :
# Route the first 10 lines of file '1' to file '3', the rest to file '2'.
counter=0
with open('1','r') as file1: #open raw data
    with open('2','w') as file3:
        with open('3','w') as file_out:
            for i in file1:
                # NOTE(review): the threshold is hard-coded, which is why
                # this only produces the first selection; subsequent ranges
                # (10-20, 20-30, ...) need a second bound or a chunk index.
                if counter <10: ############# Next I need to get line 10 to 20 followed by 20 to 30
                    file_out.write(i)
                else:
                    file3.write(i)
                counter += 1
How can I change my code so that I can get the next selection?
Does this make what you want?
def split_on_crosses(infile, chunk_size):
    """Split infile into chunk_size-line "head" files; at each chunk
    boundary, also open a "tail" file that collects every following line.

    Produces N-head.txt (the N-th chunk) and N-tail.txt (everything after
    the N-th chunk) in the current directory.
    """
    head_num = 1                            # counter for chunks
    head_file = open('1-head.txt', 'w')     # outport to first head file
    tails = []                              # outports to tail files
    with open(infile, 'r') as inport:       # open raw data
        for i, line in enumerate(inport, start=1):
            head_file.write(line)
            for t in tails:                 # write to all tail files
                t.write(line)
            if i % chunk_size == 0:         # boundary of chunk is reached
                tails.append(open('%s-tail.txt' % head_num, 'w'))  # add one tail file
                head_num += 1
                head_file.close()           # original leaked this handle
                head_file = open('%s-head.txt' % head_num, 'w')    # switch to next head file
    # Close every file we opened.  The original dropped all handles, so
    # buffered data could stay unflushed until interpreter exit.
    head_file.close()
    for t in tails:
        t.close()
split_on_crosses('infile.txt', 10)
This should do what you want, written in Python3.x.
# Slurp alpha.txt once, then route each line by position:
# 0-based indexes 9..19 (lines 10-20) go to gamma, everything else to beta.
with open('alpha.txt', 'r') as alpha:
    alphaLine = alpha.readlines()
alphaLength = len(alphaLine)

beta = open('beta.txt', 'w')
gamma = open('gamma.txt', 'w')
for i in range(alphaLength):
    target = gamma if 9 <= i < 20 else beta
    target.write(alphaLine[i])
beta.close()
gamma.close()
For speed, I will assume the file is small enough to hold in memory (rather than re-reading the file each time):
from itertools import islice

BLOCKSZ = 10  # lines per chunk

# file names
OUTPUT_LINES  = lambda a, b: "data_lines_{}_to_{}.txt" .format(a, b-1)
OUTPUT_EXCEPT = lambda a, b: "data_except_{}_to_{}.txt".format(a, b-1)
INPUT = "raw_data.txt"

def main():
    """For every BLOCKSZ-line chunk of INPUT, write one file holding the
    chunk and one file holding everything *except* the chunk."""
    # read file as list of lines (assumed small enough to hold in memory)
    with open(INPUT) as inf:
        data = list(inf)
    num_blocks = (len(data) + BLOCKSZ - 1) // BLOCKSZ  # ceiling division
    for block in range(num_blocks):
        # calculate start and end lines for this chunk
        start = block * BLOCKSZ
        end = (block + 1) * BLOCKSZ
        # write out [start:end]  (originally called undefined OUTPUT_RANGE)
        with open(OUTPUT_LINES(start, end), "w") as outf:
            outf.writelines(islice(data, start, end))
        # write out [:start] + [end:]  (originally iterated the closed
        # handle `inf` after a no-op islice -- the tail was never written)
        with open(OUTPUT_EXCEPT(start, end), "w") as outf:
            outf.writelines(islice(data, start))       # [:start]
            outf.writelines(islice(data, end, None))   # [end:]

if __name__=="__main__":
    main()
Edit: I just realized I made a mistake in my line-slicing for OUTPUT_EXCEPT (thinking of islice offsets as absolute not relative); this is now fixed.
Like I said in the title, my script only seems to work on the first line.
Here is my script:
#!/usr/bin/python
import sys
def main():
    """Strip trailing whitespace from the file named in argv[1], in place,
    and report how many whitespace characters were removed.

    NOTE(review): as posted this cannot work -- see the inline notes.
    """
    a = sys.argv[1]
    f = open(a,'r')
    lines = f.readlines()
    w = 0
    for line in lines:
        spot = 0   # reset every line, so only lines[0]/lines[1] are ever touched
        cp = line
        for char in reversed(cp):
            x = -1   # reset every iteration, so x never advances meaningfully
            if char == ' ':
                # NOTE(review): strings are immutable; del on a str raises TypeError.
                del line[x]
                # NOTE(review): adds zero -- the deletion counter never changes.
                w += 0
            # NOTE(review): this condition is always true (any char differs
            # from '\n' or from ' '), so the inner loop always breaks after
            # inspecting only the last character of the line.
            if char != '\n' or char != ' ':
                lines[spot] = line
                spot += 1
                break
            x += 1
    f.close()
    f = open(a,'w')
    f.writelines(lines)
    print("White Space deleted: "+str(w))

if __name__ == "__main__":
    main()
I'm not too experienced when it comes to loops.
The following script does the same thing as your program, more compactly:
import fileinput

# Rewrite each named file in place, stripping trailing whitespace from
# every line and counting how many characters were removed.
deleted = 0
for line in fileinput.input(inplace=True):
    stripped = line.rstrip()
    # rstrip() also removes the newline (print adds it back), so subtract
    # one from the difference -- the original added 1, overcounting by 2
    # per line and contradicting its own comment.
    deleted += len(line) - len(stripped) - 1 # don't count the newline
    print(stripped)
print("Whitespace deleted: {}".format(deleted))
Here str.rstrip() removes all whitespace from the end of a line (newlines, spaces and tabs).
The fileinput module takes care of handling sys.argv for you, opening files one by one if you name more than one file.
Using print() will add the newline back on to the end of the stripped lines.
Just use rstrip:
# Read all lines of file `a`, then rewrite it with trailing whitespace
# removed from each line (the newline is re-added explicitly).
with open(a,'r') as f:
    lines = f.readlines()
with open(a,'w') as f:
    for line in lines:
        f.write(line.rstrip()+'\n')
rstrip() is probably what you want to use to achieve this.
>>> 'Here is my string '.rstrip()
'Here is my string'
A more compact way to iterate backwards over stings is
>>> for c in 'Thing'[::-1]:
print(c)
g
n
i
h
T
[::-1] is slice notation. Slice notation can be represented as [start:stop:step]. In my example a -1 for the step means it will step from the back by one index. [x:y:z] will start at index x, stop at y-1, and go forward by z places each step.