I need to take a certain part of my file and write it to a new file, keeping the rest in another new file. So I will have 3 files: 1) the original file, 2) the selected lines, and 3) the rest. I have code that works for taking the first selection, but I'm having trouble getting the next selection and so on. Here's my code:
counter = 0
with open('1', 'r') as file1:                # open raw data
    with open('2', 'w') as file3:
        with open('3', 'w') as file_out:
            for i in file1:
                if counter < 10:             # next I need lines 10 to 20, then 20 to 30, and so on
                    file_out.write(i)
                else:
                    file3.write(i)
                counter += 1
How can I change my code so that I can get the next selection?
Does this do what you want?
def split_on_crosses(infile, chunk_size):
    head_num = 1                         # counter for chunks
    head_file = open('1-head.txt', 'w')  # outport to first head file
    tails = []                           # outports to tail files
    with open(infile, 'r') as inport:    # open raw data
        for i, line in enumerate(inport, start=1):
            head_file.write(line)
            for t in tails:              # write to all tail files
                t.write(line)
            if i % chunk_size == 0:      # boundary of chunk is reached
                tails.append(open('%s-tail.txt' % head_num, 'w'))  # add one tail file
                head_num += 1
                head_file = open('%s-head.txt' % head_num, 'w')    # switch to next head file

split_on_crosses('infile.txt', 10)
This should do what you want, written in Python 3.x.
# read alpha.txt, get its lines as a list and the number of lines, then close it
alpha = open('alpha.txt', 'r')
alphaLine = alpha.readlines()
alphaLength = len(alphaLine)
alpha.close()

# lines 1-9 and 21 onward are sent to beta, while lines 10 to 20 are sent to gamma
beta = open('beta.txt', 'w')
gamma = open('gamma.txt', 'w')
for i in range(alphaLength):
    if i < 9:
        beta.write(alphaLine[i])
    elif i < 20:
        gamma.write(alphaLine[i])
    else:
        beta.write(alphaLine[i])
beta.close()
gamma.close()
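If you want to try the snippet above quickly, a throwaway numbered input file is enough; alpha.txt matches the code above, and the 25-line length is just an example choice:

# create a small numbered test file for the snippet above
with open('alpha.txt', 'w') as f:
    for n in range(1, 26):
        f.write('line %d\n' % n)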
For speed, I will assume the file is small enough to hold in memory (rather than re-reading the file each time):
from itertools import islice

BLOCKSZ = 10   # lines per chunk

# file names
INPUT = "raw_data.txt"
OUTPUT_RANGE  = lambda a, b: "data_lines_{}_to_{}.txt" .format(a, b-1)
OUTPUT_EXCEPT = lambda a, b: "data_except_{}_to_{}.txt".format(a, b-1)

def main():
    # read file as list of lines
    with open(INPUT) as inf:
        data = list(inf)
    num_blocks = (len(data) + BLOCKSZ - 1) // BLOCKSZ
    for block in range(num_blocks):
        # calculate start and end lines for this chunk
        start = block * BLOCKSZ
        end   = (block + 1) * BLOCKSZ
        # write out [start:end]
        with open(OUTPUT_RANGE(start, end), "w") as outf:
            for line in islice(data, start, end):
                outf.write(line)
        # write out [:start] + [end:]
        with open(OUTPUT_EXCEPT(start, end), "w") as outf:
            for line in islice(data, start):
                outf.write(line)
            for line in islice(data, end, None):
                outf.write(line)

if __name__ == "__main__":
    main()
Edit: I just realized I made a mistake in my line-slicing for OUTPUT_EXCEPT (thinking of islice offsets as absolute not relative); this is now fixed.
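For reference, a tiny illustration of what that edit refers to: when you islice a list, the offsets are always counted from the start of the list, not from where a previous slice stopped:

from itertools import islice

data = ['a', 'b', 'c', 'd', 'e']
print(list(islice(data, 2)))        # ['a', 'b'] -> same as data[:2]
print(list(islice(data, 3, None)))  # ['d', 'e'] -> same as data[3:]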
Related
I have a text file (1 billion lines) of 60 GB. I have to extract data corresponding to specified line numbers, which can be read from another text file (e.g. 1, 4, 70, 100, ...). Due to the size I can't load the data into memory and then extract the lines. Also, line-by-line matching and extraction would take many days. Does any solution exist for this problem?
Two methods which I tried:
1. First method
f = open('line_numbers.txt')
lines = f.readlines()
numbers = [int(e.strip()) for e in lines]
r = max(numbers)
file = open('OUTPUT_RESULT.txt', 'w')
with open('Large_File.txt') as infile:
    for num, line in enumerate(infile, 1):
        if (num <= r):
            if (num in numbers):
                file.write(line)
        else:
            pass
        print(num)
This will take many days to get the result.
2. Second method
import pandas as pd
data = pd.read_csv('Large_File.txt', header=None)
file = open('OUTPUT_RESULT.txt','w')
f = open('line_numbers.txt')
lines = f.readlines()
numbers =[int(e.strip()) for e in lines]
x = data.loc[numbers,:]
file.write(x)
It does not manage to load the file into memory.
Is there any solution available to resolve this?
Your issue is probably with the if (num in numbers) line. Not only does it not need the parentheses, it also checks the condition on every iteration, even though your code goes through the file in order (first line 1, then line 2, etc.).
That can easily be optimised, and with that change the code below ran in only 12 seconds on a test file of about 50 million lines. It should process your file in a few minutes.
import random

numbers = sorted([random.randint(1, 50000000) for _ in range(1000)])

outfile = open('specific_lines.txt', 'w')
with open('archive_list.txt', 'r', encoding='cp437') as infile:
    for num, line in enumerate(infile, 1):
        if numbers:
            if num == numbers[0]:
                outfile.write(line)
                print(num)
                del numbers[0]
        else:
            pass
Note: this generates 1,000 random line numbers; replace them with your loaded numbers as in your example. If your list of numbers is far larger, the write time for the output file will increase execution time somewhat.
Your code would then look like:
with open('line_numbers.txt') as f:
    lines = f.readlines()
    numbers = sorted([int(e.strip()) for e in lines])

outfile = open('specific_lines.txt', 'w')
with open('archive_list.txt', 'r', encoding='cp437') as infile:
    for num, line in enumerate(infile, 1):
        if numbers:
            if num == numbers[0]:
                outfile.write(line)
                print(num)
                del numbers[0]
        else:
            pass
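As a side note (a sketch of an alternative, not part of the answer above): if you don't need to walk the numbers in order, a set gives O(1) membership tests, so the original if num in numbers structure also becomes fast:

with open('line_numbers.txt') as f:
    numbers = set(int(e.strip()) for e in f)

with open('archive_list.txt', 'r', encoding='cp437') as infile, \
        open('specific_lines.txt', 'w') as outfile:
    for num, line in enumerate(infile, 1):
        if num in numbers:          # set lookup is O(1), unlike scanning a list
            outfile.write(line)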
I know it's not completely finished, but I'm very confused about how to format the save_inventory function so that it prints like the original text file. During the add_item function, it shows that the item has been added to the lists, but when it comes to writing, nothing is there or updated.
Example of how the text file needs to look
def save_inventory(inventoryFile, descriptionArray, quantityArray, priceArray, intrecords):
    outfile = open(inventoryFile, "w")

    with open('inventory1.txt', 'r') as f:
        count = -1
        for line in f:
            count += 1
            if count % 3 == 0:  # this is the remainder operator
                outfile.write(descriptionArray)
                print(descriptionArray)

    with open('inventory1.txt', 'r') as f:
        count = -2
        for line in f:
            count += 1
            if count % 3 == 0:  # this is the remainder operator
                outfile.write(str(quantityArray))
                print(quantityArray)

    with open('inventory1.txt', 'r') as f:
        count = -3
        for line in f:
            count += 1
            if count % 3 == 0:  # this is the remainder operator
                outfile.write(str(priceArray))
                print(priceArray)

    outfile.close()
You are only writing to the file when you have read a line. If your text file is empty, you will never write to the file.
What I would do is zip the lists together and loop through them, writing three lines to the file on each pass through the loop. You can write a line break with '\n'.
with open(inventoryFile, 'w') as f:
    for d, q, p in zip(descriptionArray, quantityArray, priceArray):
        f.write('%s\n%s\n%s\n' % (d, q, p))
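To make the layout concrete, here is the same loop run against made-up sample data (the array names mirror the answer; the values and the inventory_out.txt name are invented):

descriptionArray = ['hammer', 'nails']
quantityArray = [5, 200]
priceArray = [10.99, 3.50]

with open('inventory_out.txt', 'w') as f:
    for d, q, p in zip(descriptionArray, quantityArray, priceArray):
        f.write('%s\n%s\n%s\n' % (d, q, p))

# inventory_out.txt now holds one value per line, three lines per item:
# hammer, 5, 10.99, then nails, 200, 3.50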
Input file: input.txt
entry1:name
entry1:description
entry1:reference_number
---
entry2:name
entry2:description
entry2:reference_number
---
Output file: output.txt
entry1:name entry1:description entry1:reference_number ---
entry2:name entry2:description entry2:reference_number ---
Source code
def line_break_join(infilepath, n):
    with open(infilepath) as infile:
        for i in range(1, 4):
            print infile.readline()

line_break_join("file1.txt", 4)
I can break after reading 4 lines. Furthermore, I want to join those 4 lines, and then read through the entire file joining each group of 4 lines in the same way. Any suggestion will be greatly appreciated. Thanks.
One possible way to look at this:
def line_break_join(infilepath, n):
    with open(infilepath) as infile:
        # Read all the lines in the file, removing the line breaks
        lines = infile.read().splitlines()
        # Grouping lines by pack of n
        pack = [lines[i:i+n] for i in range(0, len(lines), n)]
        # Joining each pack, putting a space between each string
        for subpack in pack:
            print " ".join(subpack)
Reading all the lines in one go will not be efficient if the file is large. Following is a possible solution:
def read_write_batch(inpath, outpath, n):
    with open(inpath) as infile, open(outpath, 'w') as outfile:
        batch = []
        for line in infile:
            batch.append(line.strip())
            if len(batch) == n:
                outfile.write(':'.join(batch))
                outfile.write('\n')
                batch = []

if __name__ == '__main__':
    read_write_batch('/tmp/test.txt', '/tmp/out.txt', 4)
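One caveat with the sketch above: if the number of lines is not a multiple of n, the final partial batch is silently dropped. A variant that also flushes the leftover lines (the _keep_tail name is only for illustration) could look like this:

def read_write_batch_keep_tail(inpath, outpath, n):
    # same as read_write_batch, but also writes a final partial batch
    with open(inpath) as infile, open(outpath, 'w') as outfile:
        batch = []
        for line in infile:
            batch.append(line.strip())
            if len(batch) == n:
                outfile.write(':'.join(batch) + '\n')
                batch = []
        if batch:                    # leftover lines at the end of the file
            outfile.write(':'.join(batch) + '\n')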
Here is a way to do it:

def join(lines, n):
    it = iter(lines)
    while True:
        line = ' '.join(it.next().strip() for _ in range(n))
        if line:
            yield '%s\n' % line
        else:
            break

with open(outfile, 'w') as out:
    out.writelines(join(open(infile), 4))
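The generator above is Python 2 only: it.next() is spelled next(it) in Python 3, and a StopIteration escaping the inner generator expression becomes a RuntimeError from Python 3.7 on (PEP 479). A rough Python 3 equivalent, reusing the input.txt/output.txt names from the question, might be:

from itertools import islice

def join_lines(lines, n):
    it = iter(lines)
    while True:
        chunk = [s.strip() for s in islice(it, n)]   # pull up to n lines
        if not chunk:                                # an empty chunk means end of file
            break
        yield ' '.join(chunk) + '\n'

with open('input.txt') as inp, open('output.txt', 'w') as out:
    out.writelines(join_lines(inp, 4))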
From my main program I am calling a script in a loop; as output, it appends a line of data to a txt file. What is the easiest way to also include the line number?
Here is the code I am using:
if area > 1000:
    f = open(output_file, "a")
    f.write("%s %s\n" % (a, b))
    f.close()
You'll first need to count the number of lines in the file already, before adding a new line with the counter incremented:
if area > 1000:
    with open(output_file, "r+") as f:
        linecount = sum(1 for _ in f)
        f.write("%s %s %s\n" % (linecount + 1, a, b))
This is the simpler approach; it reads the whole file and counts the lines. For larger files, you'd want to read only a chunk at the end, find the last line, and parse the last counter from it, to avoid reading through the whole file.
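A rough sketch of that idea (area, a, b and output_file are the names from the question's snippet; the 4096-byte tail size is an arbitrary guess that only needs to exceed the length of one line):

import os

def last_counter(path, tail_bytes=4096):
    # read only the tail of the file and parse the counter from its last line
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return 0
    with open(path, 'rb') as f:
        f.seek(0, os.SEEK_END)
        size = f.tell()
        f.seek(max(0, size - tail_bytes))
        tail = f.read().decode()
    last_line = tail.rstrip('\n').rsplit('\n', 1)[-1]
    return int(last_line.split()[0])    # the counter is the first field on the line

if area > 1000:
    count = last_counter(output_file)
    with open(output_file, 'a') as f:
        f.write('%s %s %s\n' % (count + 1, a, b))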
You should not be opening the output file every time. As for the counter, you can just maintain one yourself.
with open(input_file, 'r') as i, open(output_file, 'w') as o:
    count = 1
    for line in i:
        # do some computation
        if area > 1000:
            o.write('%d: %s %s\n' % (count, a, b))
            count += 1
I wrote a Python script to process text files.
The input is a file with several lines. At the beginning of each line there is a number (1, 2, 3, ..., n); then comes an empty line, and finally a last line on which some text is written.
I need to read through this file to delete some lines at the beginning and some at the end (say numbers 1 to 5, and then 78 to the end). I want to write the remaining lines to a new file (in a new directory) and renumber the numbers at the start of these lines (in my example, 6 would become 1, 7 would become 2, and so on).
I wrote the following:
def treatFiles(oldFile, newFile, firstF, startF, lastF):
    # firstF is simply an index
    # startF corresponds to the first line I want to keep
    # lastF corresponds to the last line I want to keep
    numberFToDeleteBeginning = int(startF) - int(firstF)

    with open(oldFile) as old, open(newFile, 'w') as new:
        countLine = 0
        for line in old:
            countLine += 1
            if countLine <= numberFToDeleteBeginning:
                pass
            elif countLine > int(lastF) - int(firstF):
                pass
            elif line.split(',')[0] == '\n':
                newLineList = line.split(',')
                new.write(line)
            else:
                newLineList = [str(countLine - numberFToDeleteBeginning)] + line.split(',')
                del newLineList[1]
                newLine = str(newLineList[0])
                for k in range(1, len(newLineList)):
                    newLine = newLine + ',' + str(newLineList[k])
                new.write(newLine)

if __name__ == '__main__':
    from sys import argv
    import os

    os.makedirs('treatedFiles')
    new = 'treatedFiles/' + argv[1]
    treatFiles(argv[1], new, argv[3], argv[4], argv[5])
My code works correctly but is far too slow (I have files of about 10 GB to process and it has been running for hours).
Does anyone know how I can improve it?
I would get rid of the for loop in the middle and the expensive .split():
from itertools import islice

def treatFiles(old_file, new_file, index, start, end):
    with open(old_file, 'r') as old, open(new_file, 'w') as new:
        sliced_file = islice(old, start - index, end - index)
        for line_number, line in enumerate(sliced_file, start=1):
            if line == '\n':
                new.write(line)
            else:
                number, rest = line.split(',', 1)
                new.write(str(line_number) + ',' + rest)
Also, convert your three numerical arguments to integers before passing them into the function:
treatFiles(argv[1], new, int(argv[3]), int(argv[4]), int(argv[5]))
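For a quick test without the command line, the revised function can also be called directly, assuming it has been defined as above; the file names and numbers here are made-up examples, with the same argument meaning as in the question (index, first line to keep, last line to keep):

import os

os.makedirs('treatedFiles', exist_ok=True)
# made-up example: keep the block between lines 6 and 77 of data.txt,
# renumbering the kept lines from 1
treatFiles('data.txt', 'treatedFiles/data.txt', 1, 6, 77)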