How can I implement something like the 'head' and 'tail' commands in Python, and read a text file backward line by line?
This is my personal file class ;-)
class File(file):
    """A helper class for file reading.

    Subclasses the Python 2 built-in ``file`` type and adds ``head``,
    ``tail`` and ``backward`` helpers.
    """

    def __init__(self, *args, **kwargs):
        super(File, self).__init__(*args, **kwargs)
        # Number of bytes read per step when scanning from the end of the file.
        self.BLOCKSIZE = 4096

    def head(self, lines_2find=1):
        """Return up to the first ``lines_2find`` lines of the file as a list.

        Lines keep their trailing newline.  A file with fewer lines gives a
        shorter list instead of letting ``StopIteration`` escape.
        """
        self.seek(0)  # Rewind file
        found = []
        for _ in xrange(lines_2find):
            try:
                found.append(super(File, self).next())
            except StopIteration:
                break  # file is shorter than requested
        return found

    def tail(self, lines_2find=1):
        """Return up to the last ``lines_2find`` lines of the file as a list.

        The file is scanned backwards in ``BLOCKSIZE`` steps until enough
        newlines have been seen, then the scanned region is re-read with
        ``readlines()``.
        """
        if lines_2find <= 0:
            # ``line_list[-0:]`` would return the whole scanned block.
            return []
        self.seek(0, 2)  # Go to end of file
        bytes_in_file = self.tell()
        lines_found, total_bytes_scanned = 0, 0
        # One extra newline is required so that the first returned line is
        # known to be complete.
        while (lines_2find + 1 > lines_found and
               bytes_in_file > total_bytes_scanned):
            byte_block = min(
                self.BLOCKSIZE,
                bytes_in_file - total_bytes_scanned)
            self.seek(-(byte_block + total_bytes_scanned), 2)
            total_bytes_scanned += byte_block
            # Read only the new block so newlines are not counted twice.
            lines_found += self.read(byte_block).count('\n')
        self.seek(-total_bytes_scanned, 2)
        line_list = list(self.readlines())
        return line_list[-lines_2find:]

    def backward(self):
        """Yield the lines of the file from last to first, without newlines.

        Empty lines are skipped, except that the first line of the file is
        always yielded last, even when it is empty.
        """
        self.seek(0, 2)  # Go to end of file
        blocksize = self.BLOCKSIZE
        last_row = ''
        while self.tell() != 0:
            try:
                self.seek(-blocksize, 1)
            except IOError:
                # Fewer than ``blocksize`` bytes remain before the start.
                blocksize = self.tell()
                self.seek(-blocksize, 1)
            block = self.read(blocksize)
            self.seek(-blocksize, 1)  # step back over what was just read
            rows = block.split('\n')
            # The previous block's first piece is the tail of this block's
            # last line.
            rows[-1] = rows[-1] + last_row
            while rows:
                last_row = rows.pop(-1)
                # The first piece of the block may be incomplete, so it is
                # carried over to the next block instead of being yielded.
                if rows and last_row:
                    yield last_row
        yield last_row
Example usage:
with File('file.name') as f:
print f.head(5)
print f.tail(5)
for row in f.backward():
print row
head is easy:
from itertools import islice
with open("file") as f:
for line in islice(f, n):
print line
tail is harder if you don't want to keep the whole file in memory. If the input is a file, you could start reading blocks beginning at the end of the file. The original tail also works if the input is a pipe, so a more general solution is to read and discard the whole input, except for the last few lines. An easy way to do this is collections.deque:
from collections import deque
with open("file") as f:
for line in deque(f, maxlen=n):
print line
In both these code snippets, n is the number of lines to print.
Tail:
def tail(fname, lines):
    """Read the last N lines from file fname.

    The file is read backwards in fixed-size chunks, so only the end of a
    large file is loaded.  Trailing whitespace (including blank lines) at the
    end of the file is ignored.

    Args:
        fname: path of the file to read.
        lines: maximum number of lines to return.

    Returns:
        A list of at most ``lines`` strings without line terminators; an
        empty list when ``lines`` is not positive or the file is empty.
    """
    import locale
    import os

    if lines <= 0:
        # ``[-0:]`` would return every line instead of none.
        return []

    BUFSIZ = 1024
    data = b""
    # Binary mode: end-relative byte offsets are not allowed on Python 3
    # text files.
    with open(fname, 'rb') as f:
        f.seek(0, os.SEEK_END)
        offset = f.tell()
        # ``lines`` newlines guarantee that the last ``lines`` pieces are
        # complete lines; the possibly partial first piece is never returned.
        while offset > 0 and data.rstrip().count(b'\n') < lines:
            step = min(BUFSIZ, offset)
            offset -= step
            f.seek(offset)
            data = f.read(step) + data

    data = data.rstrip()
    if not isinstance(data, str):
        # Python 3: decode as ``open(fname)`` would.  A multi-byte character
        # cut at the chunk boundary can only sit in the discarded partial
        # first line, so replacing it is harmless.
        data = data.decode(locale.getpreferredencoding(False), 'replace')
    return data.splitlines()[-lines:]
Related
I have a python script that is checking data from a text file and writing it if it meets the right parameters. So far I have:
# -*- coding: utf-8 -*-
import math
f = open("COLLISON.txt", 'r')
linelist = f.readlines()
f.close
f2 = open("All_Collisions_Raw_Data.txt", "w")
for line in linelist:
if 'Û' in line[0]:
f2.write(line)
f2.close()
f3 = open("Primary_Recoils_Raw_Data.txt", "w")
for line in linelist:
if 'Prime Recoil' in line:
f3.write(line)
f3.close()
S = raw_input('Are you analysing a sphere?\n Y/n \n')
if S == 'Y' or S == 'y':
rad = input('What is the radius of the sphere in Angstroms? \n')
f14 = open('All_Collisions_in_sphere', 'w')
for line in linelist:
if len(line) >55:
if 'Û' in line[0]:
Xa = float(''.join(line[25:29]))
Xs = float((Xa - rad))
Ya = float(''.join(line[36:40]))
Za = float(''.join(line[47:51]))
Xf = float(''.join(line[31:34]))
Yf = float(''.join(line[42:45]))
Zf = float(''.join(line[53:56]))
Xf1 = float(10**Xf)
Yf1 = float(10**Yf)
Zf1 = float(10**Zf)
Xd = float((Xs*Xf1))
Yd = float((Ya*Yf1))
Zd = float((Za*Zf1))
Xb = float((Xd*Xd))
Yb = float((Yd*Yd))
Zb = float((Zd*Zd))
ra = float(Xb + Yb + Zb)
r = float(math.sqrt(ra))
I = (line[6])
if r < rad:
f14.write(line)
f14.close()
I only want to write the line if I = 1 or if I is equal to the previous line's I + 1. However, I'm unsure how to refer to the previous line, or how to keep the current line for future recall. Does anyone know how I can achieve this?
One way is to just store the previous (we initialise to None and check if it is None):
prev = None
for line in file:
if prev is not None:
if line == prev:
# do stuff
prev = line
Another way is to use iterators:
itr = iter(file)
prev = next(itr)
for line in itr:
if line == prev:
# do stuff
prev = line
Edit
If you want to get each line number as well, use the enumerate function:
for line_number, line in enumerate(file, start=1):
...
Just as an FYI don't do
file = open(path)
linelist = file.readlines()
file.close()
for line in linelist:
...
but instead do this:
with open(path) as file:
for line in file:
...
The reason is that the first method reads the entire file into memory and will not close the file if an exception happens in the read, which could corrupt the file. The with statement handles all of that for you, and you can then iterate over the file directly.
The .txt file holding the data is as follows (source: "datingTestSet2.txt" in Ch.2 here):
40920 8.326976 0.953952 largeDoses
14488 7.153469 1.673904 smallDoses
26052 1.441871 0.805124 didntLike
75136 13.147394 0.428964 didntLike
38344 1.669788 0.134296 didntLike
...
Code:
from numpy import *
import operator
from os import listdir
def file2matrix(filename):
fr = open(filename)
# arr = fr.readlines() # Code1!!!!!!!!!!!!!!!!!!!
numberOfLines = len(fr.readlines()) #get the number of lines in the file
returnMat = zeros((numberOfLines,3)) #prepare matrix to return
classLabelVector = [] #prepare labels return
fr = open(filename) # Code2!!!!!!!!!!!!!!!!!!!!!
index = 0
for line in fr.readlines():
line = line.strip()
listFromLine = line.split('\t')
returnMat[index,:] = listFromLine[0:3]
classLabelVector.append(int(listFromLine[-1]))
index += 1
return returnMat,classLabelVector
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
The result of this function is:
datingDataMat datingLabels
40920 8.326976 0.953952 3
14488 7.153469 1.673904 2
26052 1.441871 0.805124 1
75136 13.147394 0.428964 1
38344 1.669788 0.134296 1
72993 10.141740 1.032955 1
35948 6.830792 1.213192 3
42666 13.276369 0.543880 3
67497 8.631577 0.749278 1
35483 12.273169 1.508053 3
50242 3.723498 0.831917 1
... ... ... ...
My questions are:
When I just remove the Code2(fr = open(filename) which above the index = 0),
the result of the function becomes all zeros matrix, and all zeros vector.
Why can't I remove Code2? Doesn't the first line (fr = open(filename)) work?
When I just add the Code1(arr = fr.readlines()), it is wrong. Why???
returnMat[index,:] = listFromLine[0:3]
IndexError: index 0 is out of bounds for axis 0 with size 0
1) You can't remove the Code2 line because of this line:
numberOfLines = len(fr.readlines()) #get the number of lines in the file
In that line you are reading to the end of the file. Opening it again puts you at the start of the file...
2) Similar to the answer above: calling readlines() reads all the lines and moves the file cursor to the end of the file. So if you then call readlines() on the file again, there is nothing left to read, hence it fails.
You are at the end of the file. Therefore, your second attempt to read the file content yields nothing. You need to go back to the beginning of the file. Use:
fr.seek(0)
Instead of your:
fr = open(filename) # Code2!!!!!!!!!!!!!!!!!!!!!
You only need to readlines once.
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Each line must hold three numeric columns followed by an integer class
    label.  The file is read exactly once.

    Args:
        filename: path of the tab-separated text file.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an array of floats with
        one row of three values per line, and a list of integer labels.
    """
    # ``with`` closes the file even if reading raises.
    with open(filename) as fr:
        lines = fr.readlines()
    numberOfLines = len(lines)  # get the number of lines in the file
    returnMat = zeros((numberOfLines, 3))  # prepare matrix to return
    classLabelVector = []  # prepare labels return
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        # returnMat holds floats while listFromLine holds strings, so the
        # conversion is made explicit here.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
I can suggest a few other changes:
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Rows are collected in plain lists first and converted to an array in one
    step, so the number of lines does not have to be known in advance.

    Args:
        filename: path of the tab-separated text file; each line holds three
            numeric columns followed by an integer class label.

    Returns:
        A tuple ``(returnMat, classLabelList)``: a float array with one row
        of three values per line, and a list of integer labels.
    """
    # Local import: the name ``np`` is not bound anywhere in the visible code.
    import numpy as np

    with open(filename) as f:
        lines = f.readlines()
    returnList = []
    classLabelList = []
    for line in lines:
        listFromLine = line.strip().split('\t')
        returnList.append(listFromLine[0:3])
        classLabelList.append(int(listFromLine[-1]))
    # dtype=float converts the numeric strings while building the array.
    returnMat = np.array(returnList, dtype=float)
    return returnMat, classLabelList
or even
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Comprehension-based variant: every line is split once, then the feature
    columns and the label column are extracted from the split rows.

    Args:
        filename: path of the tab-separated text file; each line holds three
            numeric columns followed by an integer class label.

    Returns:
        A tuple ``(returnMat, classLabelList)``: a float array with one row
        of three values per line, and a list of integer labels.
    """
    # Local import: the name ``np`` is not bound anywhere in the visible code.
    import numpy as np

    with open(filename) as f:
        lines = f.readlines()
    # Split every line; the original built a one-element list from an
    # undefined ``line`` variable.
    rows = [line.strip().split('\t') for line in lines]
    returnMat = np.array([fields[0:3] for fields in rows], dtype=float)
    classLabelList = [int(fields[-1]) for fields in rows]
    # Alternative if an array of labels is preferred over a list:
    #   np.array([fields[-1] for fields in rows], dtype=int)
    return returnMat, classLabelList
I am a Python newbie. I am trying to run this simple Python example. I wish to pass files and certain values as parameters to my function latcalc(). Could anyone suggest how I can pass my files and values as parameters? Or is there a better way to do these things?
#!/usr/bin/python
# include the constants
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8
global filename1
global filename2
global filename3
def openfiles():
filename1 = open("file1.txt", "w")
filename2 = open("file2.txt", "w")
filename3 = open("file3.txt", "w")
def latcalc(filename,target_name,vf):
target_name = 0
for length in range(min_length, max_length):
if length < 2:
target_name += (length/(vf * c_vaccum))
elif length == 2:
target_name += delay
else:
target_name = target_name
myline="%s\t%s\n" % (length, target_name)
filename.write(myline)
openfiles()
latcalc(filename1,lat40,0.4)
latcalc(filename2,lat80,0.8)
latcalc(filename3,lat100,1)
I would create a little class (give it a useful name) to encapsulate your data.
If your files grow you only have to change your create_lats
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8
#Little class to keep our data in one place
class Lat:
def __init__(self, filename, factor):
self.filename = filename
self.factor = factor
self.file = open(filename, "w") #let the class open the file
#now our function needs only one parameter, neat!
def latcalc(lat):
target_name = 0
for length in range(min_length, max_length):
if length < 2:
target_name += (length / (lat.factor * c_vaccum)) #acces the class variable
elif length == 2:
target_name += delay
else:
target_name = target_name
myline = "%s\t%s\n" % (length, target_name)
lat.file.write(myline)
def create_lats():
lats = []
lats.append(Lat("file1.txt", 0.4))
lats.append(Lat("file2.txt", 0.8))
lats.append(Lat("file3.txt", 1))
return lats
#loop over your lats created in create_lats
for lat in create_lats():
latcalc(lat)
lat.file.close() #close the file
try something like this (notice the globals are gone):
def openfiles(namelist):
    """Open every path in ``namelist`` for writing.

    Args:
        namelist: iterable of file names to open in ``'w'`` mode.

    Returns:
        A list of open file handles, in the same order as ``namelist``.
        The caller is responsible for closing them.

    Raises:
        IOError/OSError: if a file cannot be opened; handles opened so far
            are closed before the error propagates.
    """
    ret = []
    try:
        # Iterate the parameter, not a global list of names.
        for name in namelist:
            ret.append(open(name, 'w'))
    except (IOError, OSError):
        # Do not leak the handles that were opened before the failure.
        for fi in ret:
            fi.close()
        raise
    return ret
filelist = ['file1.txt', 'file2.txt', 'file3.txt']
handles = openfiles(filelist)
for handle in handles:
<do what ever you want>
handles will be a list of file handles corresponding to the filelist of names
note the file handle is what you pass around to do reads & writes with
also the opens could be done in the call to latcalc, since you would be doing one file per call apparently
As some comments point out, you don't need global variables and you should close your filehandler objects after you finished writing to them which is most conveniently done with 'with' (closing is done for you, even in case of an unexpected exception):
#!/usr/bin/python
min_length = 1
max_length = 3
delay = 100
c_vaccum = 3e8


def latcalc(filename, vf):
    """Write one "length<TAB>latency" row per length and return the total.

    For every length in ``range(min_length, max_length)`` the accumulated
    latency grows by ``length / (vf * c_vaccum)`` when the length is below 2
    and by ``delay`` when it equals 2; other lengths leave it unchanged.

    Args:
        filename: path of the output file; it is created or truncated.
        vf: velocity factor that scales ``c_vaccum``.

    Returns:
        The accumulated latency after the last length.
    """
    target_name = 0
    # Open the file once: reopening it in "w" mode for every row would
    # truncate it each time and keep only the last row.
    with open(filename, "w") as f:
        for length in range(min_length, max_length):
            if length < 2:
                target_name += (length / (vf * c_vaccum))
            elif length == 2:
                target_name += delay
            # NOTE(review): "%d" drops the fractional part of the latency;
            # switch to "%s" if the sub-unit precision matters.
            myline = "%s\t%d\n" % (length, target_name)
            f.write(myline)
    return target_name


if __name__ == "__main__":
    # latcalc takes a file *name* and a velocity factor; the result is
    # returned rather than passed in as a third argument.
    lat40 = latcalc("file1.txt", 0.4)
    lat80 = latcalc("file2.txt", 0.8)
    lat100 = latcalc("file3.txt", 1)
The way you treat the parameter target_name, I assume, you are used to C-type pointers which do not exist in that form in Python. The parameter is pointless here if you set it to a new value in the first line of latcalc(). Also, you seem to treat target_name as a string when it is an int:
myline="%s\t%s\n" % (length, target_name)
If you need target_name after the method has finished, you would have to return it.
1) open() gives you a filehandler, and not a filename
2) Use a "with" statement for opening a file, to avoid "forgetting" closing the file when finished.
#!/usr/bin/python
# include the constants
min_length = 1
max_length = 30
# delays
delay = 100
# Speed of light
c_vaccum = 3e8
def latcalc(filename, target_name, vf):
with open(filename, "w") as openedFile:
target_name = 0
for length in range(min_length, max_length):
if length < 2:
target_name += (length/(vf * c_vaccum))
elif length == 2:
target_name += delay
else:
target_name = target_name
myline="%s\t%s\n" % (length, target_name)
openedFile.write(myline)
latcalc("file1.txt", "lat40", 0.4)
latcalc("file2.txt", "lat80", 0.8)
latcalc("file3.txt", "lat100", 1)
I am doing text processing and using 'readline()' function as follows:
ifd = open(...)
for line in ifd:
while (condition)
do something...
line = ifd.readline()
condition = ....
#Here when the condition becomes false I need to rewind the pointer so that the 'for' loop read the same line again.
ifd.seek() followed by readline() is giving me a '\n' character. How can I rewind the pointer so that the whole line is read again?
>>> ifd.seek(-1,1)
>>> line = ifd.readline()
>>> line
'\n'
Here is my code
labtestnames = sorted(tmp)
#Now read each line in the inFile and write into outFile
ifd = open(inFile, "r")
ofd = open(outFile, "w")
#read the header
header = ifd.readline() #Do nothing with this line. Skip
#Write header into the output file
nl = "mrn\tspecimen_id\tlab_number\tlogin_dt\tfluid"
offset = len(nl.split("\t"))
nl = nl + "\t" + "\t".join(labtestnames)
ofd.write(nl+"\n")
lenFields = len(nl.split("\t"))
print "Reading the input file and converting into modified file for further processing (correlation analysis etc..)"
prevTup = (0,0,0)
rowComplete = 0
k=0
for line in ifd:
k=k+1
if (k==200): break
items = line.rstrip("\n").split("\t")
if((items[0] =='')):
continue
newline= list('' for i in range(lenFields))
newline[0],newline[1],newline[3],newline[2],newline[4] = items[0], items[1], items[3], items[2], items[4]
ltests = []
ltvals = []
while(cmp(prevTup, (items[0], items[1], items[3])) == 0): # If the same mrn, lab_number and specimen_id then fill the same row. else create a new row.
ltests.append(items[6])
ltvals.append(items[7])
pos = ifd.tell()
line = ifd.readline()
prevTup = (items[0], items[1], items[3])
items = line.rstrip("\n").split("\t")
rowComplete = 1
if (rowComplete == 1): #If the row is completed, prepare newline and write into outfile
indices = [labtestnames.index(x) for x in ltests]
j=0
ifd.seek(pos)
for i in indices:
newline[i+offset] = ltvals[j]
j=j+1
if (rowComplete == 0): #
currTup = (items[0], items[1], items[3])
ltests = items[6]
ltvals = items[7]
pos = ifd.tell()
line = ifd.readline()
items = line.rstrip("\n").split("\t")
newTup = (items[0], items[1], items[3])
if(cmp(currTup, newTup) == 0):
prevTup = currTup
ifd.seek(pos)
continue
else:
indices = labtestnames.index(ltests)
newline[indices+offset] = ltvals
ofd.write(newline+"\n")
The problem can be handled more simply using itertools.groupby. groupby can cluster all the contiguous lines that deal with the same mrn, specimen_id, and lab_num.
The code that does this is
for key, group in IT.groupby(reader, key = mykey):
where reader iterates over the lines of the input file, and mykey is defined by
def mykey(row):
return (row['mrn'], row['specimen_id'], row['lab_num'])
Each row from reader is passed to mykey, and all rows with the same key are clustered together in the same group.
While we're at it, we might as well use the csv module to read each line into a dict (which I call row). This frees us from having to deal with low-level string manipulation like line.rstrip("\n").split("\t") and instead of referring to columns by index numbers (e.g. row[3]) we can write code that speaks in higher-level terms such as row['lab_num'].
import itertools as IT
import csv
inFile = 'curious.dat'
outFile = 'curious.out'
def mykey(row):
return (row['mrn'], row['specimen_id'], row['lab_num'])
fieldnames = 'mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate'.split()
with open(inFile, 'rb') as ifd:
reader = csv.DictReader(ifd, delimiter = '\t')
with open(outFile, 'wb') as ofd:
writer = csv.DictWriter(
ofd, fieldnames, delimiter = '\t', lineterminator = '\n', )
writer.writeheader()
for key, group in IT.groupby(reader, key = mykey):
new = {}
row = next(group)
for key in ('mrn', 'specimen_id', 'date', 'lab_num'):
new[key] = row[key]
new[row['labtest']] = row['result_val']
for row in group:
new[row['labtest']] = row['result_val']
writer.writerow(new)
yields
mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate
4419529 1614487 26.2675 5802791G 0.1
3319529 1614487 26.2675 5802791G 0.3 153 8.1 2.1 4
5713871 682571 56.0779 9732266E 4.1
This seems to be a perfect use case for yield expressions. Consider the following example that prints lines from a file, repeating some of them at random:
def buflines(fp):
    """Iterate over ``fp`` while letting the consumer push an item back.

    Each ``next()`` yields the next item of ``fp``.  Calling
    ``send(item)`` with a truthy item makes ``send`` return ``None`` and
    causes the following ``next()`` to yield that item again before reading
    further from ``fp``.

    Args:
        fp: an iterator (for example an open file) to read items from.

    Yields:
        Items of ``fp``, with pushed-back items repeated.
    """
    pushed_back = None
    while True:
        if pushed_back:
            item = pushed_back
        else:
            try:
                item = next(fp)
            except StopIteration:
                # End the generator cleanly; a StopIteration escaping a
                # generator body is turned into RuntimeError by PEP 479.
                return
        pushed_back = yield item
        if pushed_back:
            # This value is what ``send()`` returns to the caller.
            yield None
from random import randint
with open('filename') as fp:
buf = buflines(fp)
for line in buf:
print line
if randint(1, 100) > 80:
print 'ONCE AGAIN::'
buf.send(line)
Basically, if you want to process an item once again, you send it back to the generator. On the next iteration you will be reading the same item once again.
I want to create a Python program which splits a file up into segments of a specified width, and then a consumer program takes the segments and creates a duplicate of the original file. The segments might be out of order, so I intend to use the offset value to write to the file.
Is there a way I can achieve this with without creating a local array to hold all the data on the receiving end?
for example,
f = open(file, "wb")
f.seek(offset)
f.write(data)
The idea behind this is that the program that sends the file might not be able to finish sending the file, and will resume again once it has started.
I have a sample code below which the "combine_bytes" function throws an exception when I try placing data in the buffer location.
import sys
import os
def SplitFile(fname, start, end, width):
t_fileSize = os.path.getsize(fname)
buffData = bytearray(t_fileSize)
for line, offset in get_bytes(fname, int(start), int(end), int(width)):
combine_bytes(buffData, offset, line, width)
nums = ["%02x" % ord(c) for c in line]
print " ".join(nums)
f = open("Green_copy.jpg", "wb")
f.write(buffData)
f.close()
def combine_bytes(in_buff, in_offset, in_data, in_width):
#something like memcpy would be nice
#in_buff[in_offset:in_offset + in_width] = in_data
#this works but it's the mother of inefficiency
i = in_offset
for c in in_data:
in_buff.insert(i, c)
i = i + 1
def get_bytes(fname, start, end, width):
    """Yield ``(data, offset)`` chunks of a file.

    Args:
        fname: path of the file to read (opened read-only, binary).
        start: byte offset of the first chunk.
        end: byte offset to stop at (exclusive); ``0`` means "read to the
            end of the file".
        width: maximum number of bytes per chunk; must be positive.

    Yields:
        Tuples ``(data, offset)`` where ``data`` holds at most ``width``
        bytes read from ``offset``.  The last chunk may be shorter.
        Iteration stops at ``end`` or at the end of the file, whichever
        comes first.

    Raises:
        ValueError: if ``width`` is not positive (raised on first iteration).
    """
    if width <= 0:
        # A zero width would never advance the offset.
        raise ValueError("width must be a positive number of bytes")
    t_currOffset = start
    # ``with`` closes the file even when the consumer abandons the generator
    # early or an error is raised while reading.
    with open(fname, "rb") as f:
        f.seek(t_currOffset)
        while end == 0 or t_currOffset < end:
            if end == 0:
                t_width = width
            else:
                # Clamp the final chunk so it does not run past ``end``.
                t_width = min(width, end - t_currOffset)
            t_data = f.read(t_width)
            if not t_data:
                break  # reached end of file
            yield t_data, t_currOffset
            t_currOffset += len(t_data)
if __name__ == '__main__':
try:
SplitFile(*sys.argv[1:5])
except:
print "Unexpected error:", sys.exc_info()[0]
I still couldn't figure out what your intent is, but this version of combine_bytes will get rid of your "mother of inefficiency" part (which actually is exactly that):
def combine_bytes(in_buff, in_offset, in_data, in_width):
#something like memcpy would be nice
#in_buff[in_offset:in_offset + in_width] = in_data
in_buff = in_buff[:in_offset] + in_data + in_buff[in_offset:]
return in_buff
Of course this creates a new (larger) buffer for each call, and you have to replace your buffer on the caller scope with the one returned:
buffData = combine_bytes(buffData, offset, line, width)
Found it. Here is a better way, which produces what I wanted and is faster: _buffData[t_offset:t_offset + len(t_data)] = bytearray(t_data)