Pythonic way to keep track of line number in stdin [duplicate] - python

How do I get a line count of a large file in the most memory- and time-efficient manner?
def file_len(filename):
with open(filename) as f:
for i, _ in enumerate(f):
pass
return i + 1

One line, probably pretty fast:
num_lines = sum(1 for line in open('myfile.txt'))

You can't get any better than that.
After all, any solution will have to read the entire file, figure out how many \n you have, and return that result.
Do you have a better way of doing that without reading the entire file? Not sure... The best solution will always be I/O-bound, best you can do is make sure you don't use unnecessary memory, but it looks like you have that covered.

I believe that a memory mapped file will be the fastest solution. I tried four functions: the function posted by the OP (opcount); a simple iteration over the lines in the file (simplecount); readline with a memory-mapped filed (mmap) (mapcount); and the buffer read solution offered by Mykola Kharechko (bufcount).
I ran each function five times, and calculated the average run-time for a 1.2 million-line text file.
Windows XP, Python 2.5, 2GB RAM, 2 GHz AMD processor
Here are my results:
mapcount : 0.465599966049
simplecount : 0.756399965286
bufcount : 0.546800041199
opcount : 0.718600034714
Edit: numbers for Python 2.6:
mapcount : 0.471799945831
simplecount : 0.634400033951
bufcount : 0.468800067902
opcount : 0.602999973297
So the buffer read strategy seems to be the fastest for Windows/Python 2.6
Here is the code:
from __future__ import with_statement
import time
import mmap
import random
from collections import defaultdict
def mapcount(filename):
f = open(filename, "r+")
buf = mmap.mmap(f.fileno(), 0)
lines = 0
readline = buf.readline
while readline():
lines += 1
return lines
def simplecount(filename):
lines = 0
for line in open(filename):
lines += 1
return lines
def bufcount(filename):
f = open(filename)
lines = 0
buf_size = 1024 * 1024
read_f = f.read # loop optimization
buf = read_f(buf_size)
while buf:
lines += buf.count('\n')
buf = read_f(buf_size)
return lines
def opcount(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
counts = defaultdict(list)
for i in range(5):
for func in [mapcount, simplecount, bufcount, opcount]:
start_time = time.time()
assert func("big_file.txt") == 1209138
counts[func].append(time.time() - start_time)
for key, vals in counts.items():
print key.__name__, ":", sum(vals) / float(len(vals))

I had to post this on a similar question until my reputation score jumped a bit (thanks to whoever bumped me!).
All of these solutions ignore one way to make this run considerably faster, namely by using the unbuffered (raw) interface, using bytearrays, and doing your own buffering. (This only applies in Python 3. In Python 2, the raw interface may or may not be used by default, but in Python 3, you'll default into Unicode.)
Using a modified version of the timing tool, I believe the following code is faster (and marginally more pythonic) than any of the solutions offered:
def rawcount(filename):
f = open(filename, 'rb')
lines = 0
buf_size = 1024 * 1024
read_f = f.raw.read
buf = read_f(buf_size)
while buf:
lines += buf.count(b'\n')
buf = read_f(buf_size)
return lines
Using a separate generator function, this runs a smidge faster:
def _make_gen(reader):
b = reader(1024 * 1024)
while b:
yield b
b = reader(1024*1024)
def rawgencount(filename):
f = open(filename, 'rb')
f_gen = _make_gen(f.raw.read)
return sum( buf.count(b'\n') for buf in f_gen )
This can be done completely with generators expressions in-line using itertools, but it gets pretty weird looking:
from itertools import (takewhile,repeat)
def rawincount(filename):
f = open(filename, 'rb')
bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
return sum( buf.count(b'\n') for buf in bufgen )
Here are my timings:
function average, s min, s ratio
rawincount 0.0043 0.0041 1.00
rawgencount 0.0044 0.0042 1.01
rawcount 0.0048 0.0045 1.09
bufcount 0.008 0.0068 1.64
wccount 0.01 0.0097 2.35
itercount 0.014 0.014 3.41
opcount 0.02 0.02 4.83
kylecount 0.021 0.021 5.05
simplecount 0.022 0.022 5.25
mapcount 0.037 0.031 7.46

You could execute a subprocess and run wc -l filename
import subprocess
def file_len(fname):
p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
result, err = p.communicate()
if p.returncode != 0:
raise IOError(err)
return int(result.strip().split()[0])

After a perfplot analysis, one has to recommend the buffered read solution
def buf_count_newlines_gen(fname):
def _make_gen(reader):
while True:
b = reader(2 ** 16)
if not b: break
yield b
with open(fname, "rb") as f:
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
return count
It's fast and memory-efficient. Most other solutions are about 20 times slower.
Code to reproduce the plot:
import mmap
import subprocess
from functools import partial
import perfplot
def setup(n):
fname = "t.txt"
with open(fname, "w") as f:
for i in range(n):
f.write(str(i) + "\n")
return fname
def for_enumerate(fname):
i = 0
with open(fname) as f:
for i, _ in enumerate(f):
pass
return i + 1
def sum1(fname):
return sum(1 for _ in open(fname))
def mmap_count(fname):
with open(fname, "r+") as f:
buf = mmap.mmap(f.fileno(), 0)
lines = 0
while buf.readline():
lines += 1
return lines
def for_open(fname):
lines = 0
for _ in open(fname):
lines += 1
return lines
def buf_count_newlines(fname):
lines = 0
buf_size = 2 ** 16
with open(fname) as f:
buf = f.read(buf_size)
while buf:
lines += buf.count("\n")
buf = f.read(buf_size)
return lines
def buf_count_newlines_gen(fname):
def _make_gen(reader):
b = reader(2 ** 16)
while b:
yield b
b = reader(2 ** 16)
with open(fname, "rb") as f:
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
return count
def wc_l(fname):
return int(subprocess.check_output(["wc", "-l", fname]).split()[0])
def sum_partial(fname):
with open(fname) as f:
count = sum(x.count("\n") for x in iter(partial(f.read, 2 ** 16), ""))
return count
def read_count(fname):
return open(fname).read().count("\n")
b = perfplot.bench(
setup=setup,
kernels=[
for_enumerate,
sum1,
mmap_count,
for_open,
wc_l,
buf_count_newlines,
buf_count_newlines_gen,
sum_partial,
read_count,
],
n_range=[2 ** k for k in range(27)],
xlabel="num lines",
)
b.save("out.png")
b.show()

Here is a python program to use the multiprocessing library to distribute the line counting across machines/cores. My test improves counting a 20million line file from 26 seconds to 7 seconds using an 8 core windows 64 server. Note: not using memory mapping makes things much slower.
import multiprocessing, sys, time, os, mmap
import logging, logging.handlers
def init_logger(pid):
console_format = 'P{0} %(levelname)s %(message)s'.format(pid)
logger = logging.getLogger() # New logger at root level
logger.setLevel( logging.INFO )
logger.handlers.append( logging.StreamHandler() )
logger.handlers[0].setFormatter( logging.Formatter( console_format, '%d/%m/%y %H:%M:%S' ) )
def getFileLineCount( queues, pid, processes, file1 ):
init_logger(pid)
logging.info( 'start' )
physical_file = open(file1, "r")
# mmap.mmap(fileno, length[, tagname[, access[, offset]]]
m1 = mmap.mmap( physical_file.fileno(), 0, access=mmap.ACCESS_READ )
#work out file size to divide up line counting
fSize = os.stat(file1).st_size
chunk = (fSize / processes) + 1
lines = 0
#get where I start and stop
_seedStart = chunk * (pid)
_seekEnd = chunk * (pid+1)
seekStart = int(_seedStart)
seekEnd = int(_seekEnd)
if seekEnd < int(_seekEnd + 1):
seekEnd += 1
if _seedStart < int(seekStart + 1):
seekStart += 1
if seekEnd > fSize:
seekEnd = fSize
#find where to start
if pid > 0:
m1.seek( seekStart )
#read next line
l1 = m1.readline() # need to use readline with memory mapped files
seekStart = m1.tell()
#tell previous rank my seek start to make their seek end
if pid > 0:
queues[pid-1].put( seekStart )
if pid < processes-1:
seekEnd = queues[pid].get()
m1.seek( seekStart )
l1 = m1.readline()
while len(l1) > 0:
lines += 1
l1 = m1.readline()
if m1.tell() > seekEnd or len(l1) == 0:
break
logging.info( 'done' )
# add up the results
if pid == 0:
for p in range(1,processes):
lines += queues[0].get()
queues[0].put(lines) # the total lines counted
else:
queues[0].put(lines)
m1.close()
physical_file.close()
if __name__ == '__main__':
init_logger( 'main' )
if len(sys.argv) > 1:
file_name = sys.argv[1]
else:
logging.fatal( 'parameters required: file-name [processes]' )
exit()
t = time.time()
processes = multiprocessing.cpu_count()
if len(sys.argv) > 2:
processes = int(sys.argv[2])
queues=[] # a queue for each process
for pid in range(processes):
queues.append( multiprocessing.Queue() )
jobs=[]
prev_pipe = 0
for pid in range(processes):
p = multiprocessing.Process( target = getFileLineCount, args=(queues, pid, processes, file_name,) )
p.start()
jobs.append(p)
jobs[0].join() #wait for counting to finish
lines = queues[0].get()
logging.info( 'finished {} Lines:{}'.format( time.time() - t, lines ) )

A one-line bash solution similar to this answer, using the modern subprocess.check_output function:
def line_count(filename):
return int(subprocess.check_output(['wc', '-l', filename]).split()[0])

I would use Python's file object method readlines, as follows:
with open(input_file) as foo:
lines = len(foo.readlines())
This opens the file, creates a list of lines in the file, counts the length of the list, saves that to a variable and closes the file again.

This is the fastest thing I have found using pure python.
You can use whatever amount of memory you want by setting buffer, though 2**16 appears to be a sweet spot on my computer.
from functools import partial
buffer=2**16
with open(myfile) as f:
print sum(x.count('\n') for x in iter(partial(f.read,buffer), ''))
I found the answer here Why is reading lines from stdin much slower in C++ than Python? and tweaked it just a tiny bit. Its a very good read to understand how to count lines quickly, though wc -l is still about 75% faster than anything else.

def file_len(full_path):
""" Count number of lines in a file."""
f = open(full_path)
nr_of_lines = sum(1 for line in f)
f.close()
return nr_of_lines

Here is what I use, seems pretty clean:
import subprocess
def count_file_lines(file_path):
"""
Counts the number of lines in a file using wc utility.
:param file_path: path to file
:return: int, no of lines
"""
num = subprocess.check_output(['wc', '-l', file_path])
num = num.split(' ')
return int(num[0])
UPDATE: This is marginally faster than using pure python but at the cost of memory usage. Subprocess will fork a new process with the same memory footprint as the parent process while it executes your command.

One line solution:
import os
os.system("wc -l filename")
My snippet:
>>> os.system('wc -l *.txt')
0 bar.txt
1000 command.txt
3 test_file.txt
1003 total

Kyle's answer
num_lines = sum(1 for line in open('my_file.txt'))
is probably best, an alternative for this is
num_lines = len(open('my_file.txt').read().splitlines())
Here is the comparision of performance of both
In [20]: timeit sum(1 for line in open('Charts.ipynb'))
100000 loops, best of 3: 9.79 µs per loop
In [21]: timeit len(open('Charts.ipynb').read().splitlines())
100000 loops, best of 3: 12 µs per loop

I got a small (4-8%) improvement with this version which re-uses a constant buffer so it should avoid any memory or GC overhead:
lines = 0
buffer = bytearray(2048)
with open(filename) as f:
while f.readinto(buffer) > 0:
lines += buffer.count('\n')
You can play around with the buffer size and maybe see a little improvement.

Just to complete the above methods I tried a variant with the fileinput module:
import fileinput as fi
def filecount(fname):
for line in fi.input(fname):
pass
return fi.lineno()
And passed a 60mil lines file to all the above stated methods:
mapcount : 6.1331050396
simplecount : 4.588793993
opcount : 4.42918205261
filecount : 43.2780818939
bufcount : 0.170812129974
It's a little surprise to me that fileinput is that bad and scales far worse than all the other methods...

As for me this variant will be the fastest:
#!/usr/bin/env python
def main():
f = open('filename')
lines = 0
buf_size = 1024 * 1024
read_f = f.read # loop optimization
buf = read_f(buf_size)
while buf:
lines += buf.count('\n')
buf = read_f(buf_size)
print lines
if __name__ == '__main__':
main()
reasons: buffering faster than reading line by line and string.count is also very fast

This code is shorter and clearer. It's probably the best way:
num_lines = open('yourfile.ext').read().count('\n')

I have modified the buffer case like this:
def CountLines(filename):
f = open(filename)
try:
lines = 1
buf_size = 1024 * 1024
read_f = f.read # loop optimization
buf = read_f(buf_size)
# Empty file
if not buf:
return 0
while buf:
lines += buf.count('\n')
buf = read_f(buf_size)
return lines
finally:
f.close()
Now also empty files and the last line (without \n) are counted.

print open('file.txt', 'r').read().count("\n") + 1

A lot of answers already, but unfortunately most of them are just tiny economies on a barely optimizable problem...
I worked on several projects where line count was the core function of the software, and working as fast as possible with a huge number of files was of paramount importance.
The main bottleneck with line count is I/O access, as you need to read each line in order to detect the line return character, there is simply no way around. The second potential bottleneck is memory management: the more you load at once, the faster you can process, but this bottleneck is negligible compared to the first.
Hence, there are 3 major ways to reduce the processing time of a line count function, apart from tiny optimizations such as disabling gc collection and other micro-managing tricks:
Hardware solution: the major and most obvious way is non-programmatic: buy a very fast SSD/flash hard drive. By far, this is how you can get the biggest speed boosts.
Data preparation solution: if you generate or can modify how the files you process are generated, or if it's acceptable that you can pre-process them, first convert the line return to unix style (\n) as this will save 1 character compared to Windows or MacOS styles (not a big save but it's an easy gain), and secondly and most importantly, you can potentially write lines of fixed length. If you need variable length, you can always pad smaller lines. This way, you can calculate instantly the number of lines from the total filesize, which is much faster to access. Often, the best solution to a problem is to pre-process it so that it better fits your end purpose.
Parallelization + hardware solution: if you can buy multiple hard disks (and if possible SSD flash disks), then you can even go beyond the speed of one disk by leveraging parallelization, by storing your files in a balanced way (easiest is to balance by total size) among disks, and then read in parallel from all those disks. Then, you can expect to get a multiplier boost in proportion with the number of disks you have. If buying multiple disks is not an option for you, then parallelization likely won't help (except if your disk has multiple reading headers like some professional-grade disks, but even then the disk's internal cache memory and PCB circuitry will likely be a bottleneck and prevent you from fully using all heads in parallel, plus you have to devise a specific code for this hard drive you'll use because you need to know the exact cluster mapping so that you store your files on clusters under different heads, and so that you can read them with different heads after). Indeed, it's commonly known that sequential reading is almost always faster than random reading, and parallelization on a single disk will have a performance more similar to random reading than sequential reading (you can test your hard drive speed in both aspects using CrystalDiskMark for example).
If none of those are an option, then you can only rely on micro-managing tricks to improve by a few percents the speed of your line counting function, but don't expect anything really significant. Rather, you can expect the time you'll spend tweaking will be disproportionated compared to the returns in speed improvement you'll see.

Simple method:
1)
>>> f = len(open("myfile.txt").readlines())
>>> f
430
>>> f = open("myfile.txt").read().count('\n')
>>> f
430
>>>
num_lines = len(list(open('myfile.txt')))

If one wants to get the line count cheaply in Python in Linux, I recommend this method:
import os
print os.popen("wc -l file_path").readline().split()[0]
file_path can be both abstract file path or relative path. Hope this may help.

def count_text_file_lines(path):
with open(path, 'rt') as file:
line_count = sum(1 for _line in file)
return line_count

the result of opening a file is an iterator, which can be converted to a sequence, which has a length:
with open(filename) as f:
return len(list(f))
this is more concise than your explicit loop, and avoids the enumerate.

What about this
def file_len(fname):
counts = itertools.count()
with open(fname) as f:
for _ in f: counts.next()
return counts.next()

count = max(enumerate(open(filename)))[0]

How about this?
import fileinput
import sys
counter=0
for line in fileinput.input([sys.argv[1]]):
counter+=1
fileinput.close()
print counter

How about this one-liner:
file_length = len(open('myfile.txt','r').read().split('\n'))
Takes 0.003 sec using this method to time it on a 3900 line file
def c():
import time
s = time.time()
file_length = len(open('myfile.txt','r').read().split('\n'))
print time.time() - s

def line_count(path):
count = 0
with open(path) as lines:
for count, l in enumerate(lines, start=1):
pass
return count

Related

Read a file in chunks of multiple lines [duplicate]

I am writing a code to take an enormous textfile (several GB) N lines at a time, process that batch, and move onto the next N lines until I have completed the entire file. (I don't care if the last batch isn't the perfect size).
I have been reading about using itertools islice for this operation. I think I am halfway there:
from itertools import islice
N = 16
infile = open("my_very_large_text_file", "r")
lines_gen = islice(infile, N)
for lines in lines_gen:
...process my lines...
The trouble is that I would like to process the next batch of 16 lines, but I am missing something
islice() can be used to get the next n items of an iterator. Thus, list(islice(f, n)) will return a list of the next n lines of the file f. Using this inside a loop will give you the file in chunks of n lines. At the end of the file, the list might be shorter, and finally the call will return an empty list.
from itertools import islice
with open(...) as f:
while True:
next_n_lines = list(islice(f, n))
if not next_n_lines:
break
# process next_n_lines
An alternative is to use the grouper pattern:
from itertools import zip_longest
with open(...) as f:
for next_n_lines in zip_longest(*[f] * n):
# process next_n_lines
The question appears to presume that there is efficiency to be gained by reading an "enormous textfile" in blocks of N lines at a time. This adds an application layer of buffering over the already highly optimized stdio library, adds complexity, and probably buys you absolutely nothing.
Thus:
with open('my_very_large_text_file') as f:
for line in f:
process(line)
is probably superior to any alternative in time, space, complexity and readability.
See also Rob Pike's first two rules, Jackson's Two Rules, and PEP-20 The Zen of Python. If you really just wanted to play with islice you should have left out the large file stuff.
Here is another way using groupby:
from itertools import count, groupby
N = 16
with open('test') as f:
for g, group in groupby(f, key=lambda _, c=count(): c.next()/N):
print list(group)
How it works:
Basically groupby() will group the lines by the return value of the key parameter and the key parameter is the lambda function lambda _, c=count(): c.next()/N and using the fact that the c argument will be bound to count() when the function will be defined so each time groupby() will call the lambda function and evaluate the return value to determine the grouper that will group the lines so :
# 1 iteration.
c.next() => 0
0 / 16 => 0
# 2 iteration.
c.next() => 1
1 / 16 => 0
...
# Start of the second grouper.
c.next() => 16
16/16 => 1
...
Since the requirement was added that there be statistically uniform distribution of the lines selected from the file, I offer this simple approach.
"""randsamp - extract a random subset of n lines from a large file"""
import random
def scan_linepos(path):
"""return a list of seek offsets of the beginning of each line"""
linepos = []
offset = 0
with open(path) as inf:
# WARNING: CPython 2.7 file.tell() is not accurate on file.next()
for line in inf:
linepos.append(offset)
offset += len(line)
return linepos
def sample_lines(path, linepos, nsamp):
"""return nsamp lines from path where line offsets are in linepos"""
offsets = random.sample(linepos, nsamp)
offsets.sort() # this may make file reads more efficient
lines = []
with open(path) as inf:
for offset in offsets:
inf.seek(offset)
lines.append(inf.readline())
return lines
dataset = 'big_data.txt'
nsamp = 5
linepos = scan_linepos(dataset) # the scan only need be done once
lines = sample_lines(dataset, linepos, nsamp)
print 'selecting %d lines from a file of %d' % (nsamp, len(linepos))
print ''.join(lines)
I tested it on a mock data file of 3 million lines comprising 1.7GB on disk. The scan_linepos dominated the runtime taking about 20 seconds on my not-so-hot desktop.
Just to check the performance of sample_lines I used the timeit module as so
import timeit
t = timeit.Timer('sample_lines(dataset, linepos, nsamp)',
'from __main__ import sample_lines, dataset, linepos, nsamp')
trials = 10 ** 4
elapsed = t.timeit(number=trials)
print u'%dk trials in %.2f seconds, %.2fµs per trial' % (trials/1000,
elapsed, (elapsed/trials) * (10 ** 6))
For various values of nsamp; when nsamp was 100, a single sample_lines completed in 460µs and scaled linearly up to 10k samples at 47ms per call.
The natural next question is Random is barely random at all?, and the answer is "sub-cryptographic but certainly fine for bioinformatics".
Used chunker function from What is the most “pythonic” way to iterate over a list in chunks?:
from itertools import izip_longest
def grouper(iterable, n, fillvalue=None):
"grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
args = [iter(iterable)] * n
return izip_longest(*args, fillvalue=fillvalue)
with open(filename) as f:
for lines in grouper(f, chunk_size, ""): #for every chunk_sized chunk
"""process lines like
lines[0], lines[1] , ... , lines[chunk_size-1]"""
Assuming "batch" means to want to process all 16 recs at one time instead of individually, read the file one record at a time and update a counter; when the counter hits 16, process that group. interim_list = []
infile = open("my_very_large_text_file", "r")
ctr = 0
for rec in infile:
interim_list.append(rec)
ctr += 1
if ctr > 15:
process_list(interim_list)
interim_list = []
ctr = 0
the final group
process_list(interim_list)
Another solution might be to create an iterator that yields lists of n elements:
def n_elements(n, it):
try:
while True:
yield [next(it) for j in range(0, n)]
except StopIteration:
return
with open(filename, 'rt') as f:
for n_lines in n_elements(n, f):
do_stuff(n_lines)

Extract text in string until certain new line ("\n") [duplicate]

We have a large raw data file that we would like to trim to a specified size.
How would I go about getting the first N lines of a text file in python? Will the OS being used have any effect on the implementation?
Python 3:
with open(path_to_file) as input_file:
head = [next(input_file) for _ in range(lines_number)]
print(head)
Python 2:
with open(path_to_file) as input_file:
head = [next(input_file) for _ in xrange(lines_number)]
print head
Here's another way (both Python 2 & 3):
from itertools import islice
with open(path_to_file) as input_file:
head = list(islice(path_to_file, lines_number))
print(head)
N = 10
with open("file.txt", "a") as file: # the a opens it in append mode
for i in range(N):
line = next(file).strip()
print(line)
If you want to read the first lines quickly and you don't care about performance you can use .readlines() which returns list object and then slice the list.
E.g. for the first 5 lines:
with open("pathofmyfileandfileandname") as myfile:
firstNlines=myfile.readlines()[0:5] #put here the interval you want
Note: the whole file is read so is not the best from the performance point of view but it
is easy to use, fast to write and easy to remember so if you want just perform
some one-time calculation is very convenient
print firstNlines
One advantage compared to the other answers is the possibility to select easily the range of lines e.g. skipping the first 10 lines [10:30] or the lasts 10 [:-10] or taking only even lines [::2].
What I do is to call the N lines using pandas. I think the performance is not the best, but for example if N=1000:
import pandas as pd
yourfile = pd.read_csv('path/to/your/file.csv',nrows=1000)
There is no specific method to read number of lines exposed by file object.
I guess the easiest way would be following:
lines =[]
with open(file_name) as f:
lines.extend(f.readline() for i in xrange(N))
The two most intuitive ways of doing this would be:
Iterate on the file line-by-line, and break after N lines.
Iterate on the file line-by-line using the next() method N times. (This is essentially just a different syntax for what the top answer does.)
Here is the code:
# Method 1:
with open("fileName", "r") as f:
counter = 0
for line in f:
print line
counter += 1
if counter == N: break
# Method 2:
with open("fileName", "r") as f:
for i in xrange(N):
line = f.next()
print line
The bottom line is, as long as you don't use readlines() or enumerateing the whole file into memory, you have plenty of options.
Based on gnibbler top voted answer (Nov 20 '09 at 0:27): this class add head() and tail() method to file object.
class File(file):
def head(self, lines_2find=1):
self.seek(0) #Rewind file
return [self.next() for x in xrange(lines_2find)]
def tail(self, lines_2find=1):
self.seek(0, 2) #go to end of file
bytes_in_file = self.tell()
lines_found, total_bytes_scanned = 0, 0
while (lines_2find+1 > lines_found and
bytes_in_file > total_bytes_scanned):
byte_block = min(1024, bytes_in_file-total_bytes_scanned)
self.seek(-(byte_block+total_bytes_scanned), 2)
total_bytes_scanned += byte_block
lines_found += self.read(1024).count('\n')
self.seek(-total_bytes_scanned, 2)
line_list = list(self.readlines())
return line_list[-lines_2find:]
Usage:
f = File('path/to/file', 'r')
f.head(3)
f.tail(3)
most convinient way on my own:
LINE_COUNT = 3
print [s for (i, s) in enumerate(open('test.txt')) if i < LINE_COUNT]
Solution based on List Comprehension
The function open() supports an iteration interface. The enumerate() covers open() and return tuples (index, item), then we check that we're inside an accepted range (if i < LINE_COUNT) and then simply print the result.
Enjoy the Python. ;)
For first 5 lines, simply do:
N=5
with open("data_file", "r") as file:
for i in range(N):
print file.next()
If you want something that obviously (without looking up esoteric stuff in manuals) works without imports and try/except and works on a fair range of Python 2.x versions (2.2 to 2.6):
def headn(file_name, n):
"""Like *x head -N command"""
result = []
nlines = 0
assert n >= 1
for line in open(file_name):
result.append(line)
nlines += 1
if nlines >= n:
break
return result
if __name__ == "__main__":
import sys
rval = headn(sys.argv[1], int(sys.argv[2]))
print rval
print len(rval)
If you have a really big file, and assuming you want the output to be a numpy array, using np.genfromtxt will freeze your computer. This is so much better in my experience:
def load_big_file(fname,maxrows):
'''only works for well-formed text file of space-separated doubles'''
rows = [] # unknown number of lines, so use list
with open(fname) as f:
j=0
for line in f:
if j==maxrows:
break
else:
line = [float(s) for s in line.split()]
rows.append(np.array(line, dtype = np.double))
j+=1
return np.vstack(rows) # convert list of vectors to array
This worked for me
f = open("history_export.csv", "r")
line= 5
for x in range(line):
a = f.readline()
print(a)
I would like to handle the file with less than n-lines by reading the whole file
def head(filename: str, n: int):
try:
with open(filename) as f:
head_lines = [next(f).rstrip() for x in range(n)]
except StopIteration:
with open(filename) as f:
head_lines = f.read().splitlines()
return head_lines
Credit go to John La Rooy and Ilian Iliev. Use the function for the best performance with exception handle
Revise 1: Thanks FrankM for the feedback, to handle file existence and read permission we can futher add
import errno
import os
def head(filename: str, n: int):
if not os.path.isfile(filename):
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)
if not os.access(filename, os.R_OK):
raise PermissionError(errno.EACCES, os.strerror(errno.EACCES), filename)
try:
with open(filename) as f:
head_lines = [next(f).rstrip() for x in range(n)]
except StopIteration:
with open(filename) as f:
head_lines = f.read().splitlines()
return head_lines
You can either go with second version or go with the first one and handle the file exception later. The check is quick and mostly free from performance standpoint
Starting at Python 2.6, you can take advantage of more sophisticated functions in the IO base clase. So the top rated answer above can be rewritten as:
with open("datafile") as myfile:
head = myfile.readlines(N)
print head
(You don't have to worry about your file having less than N lines since no StopIteration exception is thrown.)
This works for Python 2 & 3:
from itertools import islice
with open('/tmp/filename.txt') as inf:
for line in islice(inf, N, N+M):
print(line)
fname = input("Enter file name: ")
num_lines = 0
with open(fname, 'r') as f: #lines count
for line in f:
num_lines += 1
num_lines_input = int (input("Enter line numbers: "))
if num_lines_input <= num_lines:
f = open(fname, "r")
for x in range(num_lines_input):
a = f.readline()
print(a)
else:
f = open(fname, "r")
for x in range(num_lines_input):
a = f.readline()
print(a)
print("Don't have", num_lines_input, " lines print as much as you can")
print("Total lines in the text",num_lines)
Here's another decent solution with a list comprehension:
file = open('file.txt', 'r')
lines = [next(file) for x in range(3)] # first 3 lines will be in this list
file.close()
An easy way to get first 10 lines:
with open('fileName.txt', mode = 'r') as file:
list = [line.rstrip('\n') for line in file][:10]
print(list)
#!/usr/bin/python
import subprocess
p = subprocess.Popen(["tail", "-n 3", "passlist"], stdout=subprocess.PIPE)
output, err = p.communicate()
print output
This Method Worked for me
Simply Convert your CSV file object to a list using list(file_data)
import csv;
with open('your_csv_file.csv') as file_obj:
file_data = csv.reader(file_obj);
file_list = list(file_data)
for row in file_list[:4]:
print(row)

Performance issue when parsing a large file line by line

I have a set of several millions of small numbers stored in a file
I wrote a Python script that reads numbers from a tab delimited text file line by line, computes the reminders and appends the result to an output file. For some reason it consumes a lot of ram (20 Gb of ram on Ubuntu to parse a million of numbers). It also freezes the system because of frequent writes.
What is the correct way to tweak this script.
import os
import re
my_path = '/media/me/mSata/res/'
# output_file.open() before the first loop didn't help
for file_id in range (10,11): #10,201
filename = my_path + "in" + str(file_id) + ".txt"
fstr0 = ""+my_path +"out"+ str(file_id)+"_0.log"
fstr1 = ""+my_path +"res"+ str(file_id)+"_1.log"
with open(filename) as fp:
stats = [0] * (512)
line = fp.readline()
while line:
raw_line = line.strip()
arr_of_parsed_numbers = re.split(r'\t+', raw_line.rstrip('\t'))
for num_index in range(0, len(arr_of_parsed_numbers)):
my_number = int(arr_of_parsed_numbers[num_index])
v0 = (my_number % 257) -1 #value 257 is correct
my_number = (my_number )//257
stats[v0] += 1
v1 = my_number % 256
stats[256+v1]+=1
f0 = open(fstr0, "a")
f1 = open(fstr1, "a")
f0.write("{}\n".format(str(v0).rjust(3)))
f1.write("{}\n".format(str(v1).rjust(3)))
f0.close()
f1.close()
line=fp.readLine()
print(stats)
# tried output_file.close() here as well
print("done")
Updated:
I've ran this script under Windows 10 (10 Mb memory in Python.exe) and Ubuntu (10 Gb memory consumed). What can cause this discrepancy? Thousand times more is a lot.
his script consumes about 20Mb on Windows 10 (looking o
try something like this. Note the files are only opened and closed once each, and the loop iterates once per line.
import os
import re
my_path = '/media/me/mSata/res/'
# output_file.open() before the first loop didn't help
for file_id in range (10,11): #10,201
filename = my_path + "in" + str(file_id) + ".txt"
fstr0 = ""+my_path +"out"+ str(file_id)+"_0.log"
fstr1 = ""+my_path +"res"+ str(file_id)+"_1.log"
with open(filename, "r") as fp, open(fstr0, "a") as f0, open(fstr1, "a") as f1:
stats = [0] * (512)
for line in fp:
raw_line = line.strip()
arr_of_parsed_numbers = re.split(r'\t+', raw_line.rstrip('\t'))
for num_index in range(0, len(arr_of_parsed_numbers)):
my_number = int(arr_of_parsed_numbers[num_index])
v0 = (my_number % 257) -1 #value 257 is correct
my_number = (my_number )//257
stats[v0] += 1
v1 = my_number % 256
stats[256+v1]+=1
f0.write("{}\n".format(str(v0).rjust(3)))
f1.write("{}\n".format(str(v1).rjust(3)))
print(stats)
# tried output_file.close() here as well
print("done")

Fastest way to find total lines in a text file [duplicate]

How do I get a line count of a large file in the most memory- and time-efficient manner?
def file_len(filename):
with open(filename) as f:
for i, _ in enumerate(f):
pass
return i + 1
One line, probably pretty fast:
num_lines = sum(1 for line in open('myfile.txt'))
You can't get any better than that.
After all, any solution will have to read the entire file, figure out how many \n you have, and return that result.
Do you have a better way of doing that without reading the entire file? Not sure... The best solution will always be I/O-bound, best you can do is make sure you don't use unnecessary memory, but it looks like you have that covered.
I believe that a memory mapped file will be the fastest solution. I tried four functions: the function posted by the OP (opcount); a simple iteration over the lines in the file (simplecount); readline with a memory-mapped filed (mmap) (mapcount); and the buffer read solution offered by Mykola Kharechko (bufcount).
I ran each function five times, and calculated the average run-time for a 1.2 million-line text file.
Windows XP, Python 2.5, 2GB RAM, 2 GHz AMD processor
Here are my results:
mapcount : 0.465599966049
simplecount : 0.756399965286
bufcount : 0.546800041199
opcount : 0.718600034714
Edit: numbers for Python 2.6:
mapcount : 0.471799945831
simplecount : 0.634400033951
bufcount : 0.468800067902
opcount : 0.602999973297
So the buffer read strategy seems to be the fastest for Windows/Python 2.6
Here is the code:
from __future__ import with_statement
import time
import mmap
import random
from collections import defaultdict
def mapcount(filename):
f = open(filename, "r+")
buf = mmap.mmap(f.fileno(), 0)
lines = 0
readline = buf.readline
while readline():
lines += 1
return lines
def simplecount(filename):
lines = 0
for line in open(filename):
lines += 1
return lines
def bufcount(filename):
f = open(filename)
lines = 0
buf_size = 1024 * 1024
read_f = f.read # loop optimization
buf = read_f(buf_size)
while buf:
lines += buf.count('\n')
buf = read_f(buf_size)
return lines
def opcount(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
counts = defaultdict(list)
for i in range(5):
for func in [mapcount, simplecount, bufcount, opcount]:
start_time = time.time()
assert func("big_file.txt") == 1209138
counts[func].append(time.time() - start_time)
for key, vals in counts.items():
print key.__name__, ":", sum(vals) / float(len(vals))
I had to post this on a similar question until my reputation score jumped a bit (thanks to whoever bumped me!).
All of these solutions ignore one way to make this run considerably faster, namely by using the unbuffered (raw) interface, using bytearrays, and doing your own buffering. (This only applies in Python 3. In Python 2, the raw interface may or may not be used by default, but in Python 3, you'll default into Unicode.)
Using a modified version of the timing tool, I believe the following code is faster (and marginally more pythonic) than any of the solutions offered:
def rawcount(filename):
f = open(filename, 'rb')
lines = 0
buf_size = 1024 * 1024
read_f = f.raw.read
buf = read_f(buf_size)
while buf:
lines += buf.count(b'\n')
buf = read_f(buf_size)
return lines
Using a separate generator function, this runs a smidge faster:
def _make_gen(reader):
b = reader(1024 * 1024)
while b:
yield b
b = reader(1024*1024)
def rawgencount(filename):
f = open(filename, 'rb')
f_gen = _make_gen(f.raw.read)
return sum( buf.count(b'\n') for buf in f_gen )
This can be done completely with generators expressions in-line using itertools, but it gets pretty weird looking:
from itertools import (takewhile,repeat)
def rawincount(filename):
f = open(filename, 'rb')
bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
return sum( buf.count(b'\n') for buf in bufgen )
Here are my timings:
function average, s min, s ratio
rawincount 0.0043 0.0041 1.00
rawgencount 0.0044 0.0042 1.01
rawcount 0.0048 0.0045 1.09
bufcount 0.008 0.0068 1.64
wccount 0.01 0.0097 2.35
itercount 0.014 0.014 3.41
opcount 0.02 0.02 4.83
kylecount 0.021 0.021 5.05
simplecount 0.022 0.022 5.25
mapcount 0.037 0.031 7.46
You could execute a subprocess and run wc -l filename
import subprocess
def file_len(fname):
p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
result, err = p.communicate()
if p.returncode != 0:
raise IOError(err)
return int(result.strip().split()[0])
After a perfplot analysis, one has to recommend the buffered read solution
def buf_count_newlines_gen(fname):
def _make_gen(reader):
while True:
b = reader(2 ** 16)
if not b: break
yield b
with open(fname, "rb") as f:
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
return count
It's fast and memory-efficient. Most other solutions are about 20 times slower.
Code to reproduce the plot:
import mmap
import subprocess
from functools import partial
import perfplot
def setup(n):
fname = "t.txt"
with open(fname, "w") as f:
for i in range(n):
f.write(str(i) + "\n")
return fname
def for_enumerate(fname):
i = 0
with open(fname) as f:
for i, _ in enumerate(f):
pass
return i + 1
def sum1(fname):
return sum(1 for _ in open(fname))
def mmap_count(fname):
with open(fname, "r+") as f:
buf = mmap.mmap(f.fileno(), 0)
lines = 0
while buf.readline():
lines += 1
return lines
def for_open(fname):
lines = 0
for _ in open(fname):
lines += 1
return lines
def buf_count_newlines(fname):
lines = 0
buf_size = 2 ** 16
with open(fname) as f:
buf = f.read(buf_size)
while buf:
lines += buf.count("\n")
buf = f.read(buf_size)
return lines
def buf_count_newlines_gen(fname):
def _make_gen(reader):
b = reader(2 ** 16)
while b:
yield b
b = reader(2 ** 16)
with open(fname, "rb") as f:
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
return count
def wc_l(fname):
return int(subprocess.check_output(["wc", "-l", fname]).split()[0])
def sum_partial(fname):
with open(fname) as f:
count = sum(x.count("\n") for x in iter(partial(f.read, 2 ** 16), ""))
return count
def read_count(fname):
return open(fname).read().count("\n")
b = perfplot.bench(
setup=setup,
kernels=[
for_enumerate,
sum1,
mmap_count,
for_open,
wc_l,
buf_count_newlines,
buf_count_newlines_gen,
sum_partial,
read_count,
],
n_range=[2 ** k for k in range(27)],
xlabel="num lines",
)
b.save("out.png")
b.show()
Here is a python program to use the multiprocessing library to distribute the line counting across machines/cores. My test improves counting a 20million line file from 26 seconds to 7 seconds using an 8 core windows 64 server. Note: not using memory mapping makes things much slower.
import multiprocessing, sys, time, os, mmap
import logging, logging.handlers
def init_logger(pid):
console_format = 'P{0} %(levelname)s %(message)s'.format(pid)
logger = logging.getLogger() # New logger at root level
logger.setLevel( logging.INFO )
logger.handlers.append( logging.StreamHandler() )
logger.handlers[0].setFormatter( logging.Formatter( console_format, '%d/%m/%y %H:%M:%S' ) )
def getFileLineCount( queues, pid, processes, file1 ):
init_logger(pid)
logging.info( 'start' )
physical_file = open(file1, "r")
# mmap.mmap(fileno, length[, tagname[, access[, offset]]]
m1 = mmap.mmap( physical_file.fileno(), 0, access=mmap.ACCESS_READ )
#work out file size to divide up line counting
fSize = os.stat(file1).st_size
chunk = (fSize / processes) + 1
lines = 0
#get where I start and stop
_seedStart = chunk * (pid)
_seekEnd = chunk * (pid+1)
seekStart = int(_seedStart)
seekEnd = int(_seekEnd)
if seekEnd < int(_seekEnd + 1):
seekEnd += 1
if _seedStart < int(seekStart + 1):
seekStart += 1
if seekEnd > fSize:
seekEnd = fSize
#find where to start
if pid > 0:
m1.seek( seekStart )
#read next line
l1 = m1.readline() # need to use readline with memory mapped files
seekStart = m1.tell()
#tell previous rank my seek start to make their seek end
if pid > 0:
queues[pid-1].put( seekStart )
if pid < processes-1:
seekEnd = queues[pid].get()
m1.seek( seekStart )
l1 = m1.readline()
while len(l1) > 0:
lines += 1
l1 = m1.readline()
if m1.tell() > seekEnd or len(l1) == 0:
break
logging.info( 'done' )
# add up the results
if pid == 0:
for p in range(1,processes):
lines += queues[0].get()
queues[0].put(lines) # the total lines counted
else:
queues[0].put(lines)
m1.close()
physical_file.close()
if __name__ == '__main__':
init_logger( 'main' )
if len(sys.argv) > 1:
file_name = sys.argv[1]
else:
logging.fatal( 'parameters required: file-name [processes]' )
exit()
t = time.time()
processes = multiprocessing.cpu_count()
if len(sys.argv) > 2:
processes = int(sys.argv[2])
queues=[] # a queue for each process
for pid in range(processes):
queues.append( multiprocessing.Queue() )
jobs=[]
prev_pipe = 0
for pid in range(processes):
p = multiprocessing.Process( target = getFileLineCount, args=(queues, pid, processes, file_name,) )
p.start()
jobs.append(p)
jobs[0].join() #wait for counting to finish
lines = queues[0].get()
logging.info( 'finished {} Lines:{}'.format( time.time() - t, lines ) )
A one-line bash solution similar to this answer, using the modern subprocess.check_output function:
def line_count(filename):
return int(subprocess.check_output(['wc', '-l', filename]).split()[0])
I would use Python's file object method readlines, as follows:
with open(input_file) as foo:
lines = len(foo.readlines())
This opens the file, creates a list of lines in the file, counts the length of the list, saves that to a variable and closes the file again.
This is the fastest thing I have found using pure python.
You can use whatever amount of memory you want by setting buffer, though 2**16 appears to be a sweet spot on my computer.
from functools import partial
buffer=2**16
with open(myfile) as f:
print sum(x.count('\n') for x in iter(partial(f.read,buffer), ''))
I found the answer here Why is reading lines from stdin much slower in C++ than Python? and tweaked it just a tiny bit. Its a very good read to understand how to count lines quickly, though wc -l is still about 75% faster than anything else.
def file_len(full_path):
""" Count number of lines in a file."""
f = open(full_path)
nr_of_lines = sum(1 for line in f)
f.close()
return nr_of_lines
Here is what I use, seems pretty clean:
import subprocess
def count_file_lines(file_path):
"""
Counts the number of lines in a file using wc utility.
:param file_path: path to file
:return: int, no of lines
"""
num = subprocess.check_output(['wc', '-l', file_path])
num = num.split(' ')
return int(num[0])
UPDATE: This is marginally faster than using pure python but at the cost of memory usage. Subprocess will fork a new process with the same memory footprint as the parent process while it executes your command.
One line solution:
import os
os.system("wc -l filename")
My snippet:
>>> os.system('wc -l *.txt')
0 bar.txt
1000 command.txt
3 test_file.txt
1003 total
Kyle's answer
num_lines = sum(1 for line in open('my_file.txt'))
is probably best, an alternative for this is
num_lines = len(open('my_file.txt').read().splitlines())
Here is the comparision of performance of both
In [20]: timeit sum(1 for line in open('Charts.ipynb'))
100000 loops, best of 3: 9.79 µs per loop
In [21]: timeit len(open('Charts.ipynb').read().splitlines())
100000 loops, best of 3: 12 µs per loop
I got a small (4-8%) improvement with this version which re-uses a constant buffer so it should avoid any memory or GC overhead:
lines = 0
buffer = bytearray(2048)
with open(filename) as f:
while f.readinto(buffer) > 0:
lines += buffer.count('\n')
You can play around with the buffer size and maybe see a little improvement.
Just to complete the above methods I tried a variant with the fileinput module:
import fileinput as fi
def filecount(fname):
for line in fi.input(fname):
pass
return fi.lineno()
And passed a 60mil lines file to all the above stated methods:
mapcount : 6.1331050396
simplecount : 4.588793993
opcount : 4.42918205261
filecount : 43.2780818939
bufcount : 0.170812129974
It's a little surprise to me that fileinput is that bad and scales far worse than all the other methods...
As for me this variant will be the fastest:
#!/usr/bin/env python
def main():
f = open('filename')
lines = 0
buf_size = 1024 * 1024
read_f = f.read # loop optimization
buf = read_f(buf_size)
while buf:
lines += buf.count('\n')
buf = read_f(buf_size)
print lines
if __name__ == '__main__':
main()
reasons: buffering faster than reading line by line and string.count is also very fast
This code is shorter and clearer. It's probably the best way:
num_lines = open('yourfile.ext').read().count('\n')
I have modified the buffer case like this:
def CountLines(filename):
f = open(filename)
try:
lines = 1
buf_size = 1024 * 1024
read_f = f.read # loop optimization
buf = read_f(buf_size)
# Empty file
if not buf:
return 0
while buf:
lines += buf.count('\n')
buf = read_f(buf_size)
return lines
finally:
f.close()
Now also empty files and the last line (without \n) are counted.
print open('file.txt', 'r').read().count("\n") + 1
A lot of answers already, but unfortunately most of them are just tiny economies on a barely optimizable problem...
I worked on several projects where line count was the core function of the software, and working as fast as possible with a huge number of files was of paramount importance.
The main bottleneck with line count is I/O access, as you need to read each line in order to detect the line return character, there is simply no way around. The second potential bottleneck is memory management: the more you load at once, the faster you can process, but this bottleneck is negligible compared to the first.
Hence, there are 3 major ways to reduce the processing time of a line count function, apart from tiny optimizations such as disabling gc collection and other micro-managing tricks:
Hardware solution: the major and most obvious way is non-programmatic: buy a very fast SSD/flash hard drive. By far, this is how you can get the biggest speed boosts.
Data preparation solution: if you generate or can modify how the files you process are generated, or if it's acceptable that you can pre-process them, first convert the line return to unix style (\n) as this will save 1 character compared to Windows or MacOS styles (not a big save but it's an easy gain), and secondly and most importantly, you can potentially write lines of fixed length. If you need variable length, you can always pad smaller lines. This way, you can calculate instantly the number of lines from the total filesize, which is much faster to access. Often, the best solution to a problem is to pre-process it so that it better fits your end purpose.
Parallelization + hardware solution: if you can buy multiple hard disks (and if possible SSD flash disks), then you can even go beyond the speed of one disk by leveraging parallelization, by storing your files in a balanced way (easiest is to balance by total size) among disks, and then read in parallel from all those disks. Then, you can expect to get a multiplier boost in proportion with the number of disks you have. If buying multiple disks is not an option for you, then parallelization likely won't help (except if your disk has multiple reading headers like some professional-grade disks, but even then the disk's internal cache memory and PCB circuitry will likely be a bottleneck and prevent you from fully using all heads in parallel, plus you have to devise a specific code for this hard drive you'll use because you need to know the exact cluster mapping so that you store your files on clusters under different heads, and so that you can read them with different heads after). Indeed, it's commonly known that sequential reading is almost always faster than random reading, and parallelization on a single disk will have a performance more similar to random reading than sequential reading (you can test your hard drive speed in both aspects using CrystalDiskMark for example).
If none of those are an option, then you can only rely on micro-managing tricks to improve by a few percents the speed of your line counting function, but don't expect anything really significant. Rather, you can expect the time you'll spend tweaking will be disproportionated compared to the returns in speed improvement you'll see.
Simple method:
1)
>>> f = len(open("myfile.txt").readlines())
>>> f
430
>>> f = open("myfile.txt").read().count('\n')
>>> f
430
>>>
num_lines = len(list(open('myfile.txt')))
If one wants to get the line count cheaply in Python in Linux, I recommend this method:
import os
print os.popen("wc -l file_path").readline().split()[0]
file_path can be both abstract file path or relative path. Hope this may help.
def count_text_file_lines(path):
with open(path, 'rt') as file:
line_count = sum(1 for _line in file)
return line_count
the result of opening a file is an iterator, which can be converted to a sequence, which has a length:
with open(filename) as f:
return len(list(f))
this is more concise than your explicit loop, and avoids the enumerate.
What about this
def file_len(fname):
counts = itertools.count()
with open(fname) as f:
for _ in f: counts.next()
return counts.next()
count = max(enumerate(open(filename)))[0]
How about this?
import fileinput
import sys
counter=0
for line in fileinput.input([sys.argv[1]]):
counter+=1
fileinput.close()
print counter
How about this one-liner:
file_length = len(open('myfile.txt','r').read().split('\n'))
Takes 0.003 sec using this method to time it on a 3900 line file
def c():
import time
s = time.time()
file_length = len(open('myfile.txt','r').read().split('\n'))
print time.time() - s
def line_count(path):
count = 0
with open(path) as lines:
for count, l in enumerate(lines, start=1):
pass
return count

How to read first N lines of a file?

We have a large raw data file that we would like to trim to a specified size.
How would I go about getting the first N lines of a text file in python? Will the OS being used have any effect on the implementation?
Python 3:
with open(path_to_file) as input_file:
head = [next(input_file) for _ in range(lines_number)]
print(head)
Python 2:
with open(path_to_file) as input_file:
head = [next(input_file) for _ in xrange(lines_number)]
print head
Here's another way (both Python 2 & 3):
from itertools import islice
with open(path_to_file) as input_file:
head = list(islice(path_to_file, lines_number))
print(head)
N = 10
with open("file.txt", "a") as file: # the a opens it in append mode
for i in range(N):
line = next(file).strip()
print(line)
If you want to read the first lines quickly and you don't care about performance you can use .readlines() which returns list object and then slice the list.
E.g. for the first 5 lines:
with open("pathofmyfileandfileandname") as myfile:
firstNlines=myfile.readlines()[0:5] #put here the interval you want
Note: the whole file is read so is not the best from the performance point of view but it
is easy to use, fast to write and easy to remember so if you want just perform
some one-time calculation is very convenient
print firstNlines
One advantage compared to the other answers is the possibility to select easily the range of lines e.g. skipping the first 10 lines [10:30] or the lasts 10 [:-10] or taking only even lines [::2].
What I do is to call the N lines using pandas. I think the performance is not the best, but for example if N=1000:
import pandas as pd
yourfile = pd.read_csv('path/to/your/file.csv',nrows=1000)
There is no specific method to read number of lines exposed by file object.
I guess the easiest way would be following:
lines =[]
with open(file_name) as f:
lines.extend(f.readline() for i in xrange(N))
The two most intuitive ways of doing this would be:
Iterate on the file line-by-line, and break after N lines.
Iterate on the file line-by-line using the next() method N times. (This is essentially just a different syntax for what the top answer does.)
Here is the code:
# Method 1:
with open("fileName", "r") as f:
counter = 0
for line in f:
print line
counter += 1
if counter == N: break
# Method 2:
with open("fileName", "r") as f:
for i in xrange(N):
line = f.next()
print line
The bottom line is, as long as you don't use readlines() or enumerateing the whole file into memory, you have plenty of options.
Based on gnibbler top voted answer (Nov 20 '09 at 0:27): this class add head() and tail() method to file object.
class File(file):
def head(self, lines_2find=1):
self.seek(0) #Rewind file
return [self.next() for x in xrange(lines_2find)]
def tail(self, lines_2find=1):
self.seek(0, 2) #go to end of file
bytes_in_file = self.tell()
lines_found, total_bytes_scanned = 0, 0
while (lines_2find+1 > lines_found and
bytes_in_file > total_bytes_scanned):
byte_block = min(1024, bytes_in_file-total_bytes_scanned)
self.seek(-(byte_block+total_bytes_scanned), 2)
total_bytes_scanned += byte_block
lines_found += self.read(1024).count('\n')
self.seek(-total_bytes_scanned, 2)
line_list = list(self.readlines())
return line_list[-lines_2find:]
Usage:
f = File('path/to/file', 'r')
f.head(3)
f.tail(3)
most convinient way on my own:
LINE_COUNT = 3
print [s for (i, s) in enumerate(open('test.txt')) if i < LINE_COUNT]
Solution based on List Comprehension
The function open() supports an iteration interface. The enumerate() covers open() and return tuples (index, item), then we check that we're inside an accepted range (if i < LINE_COUNT) and then simply print the result.
Enjoy the Python. ;)
For first 5 lines, simply do:
N=5
with open("data_file", "r") as file:
for i in range(N):
print file.next()
If you want something that obviously (without looking up esoteric stuff in manuals) works without imports and try/except and works on a fair range of Python 2.x versions (2.2 to 2.6):
def headn(file_name, n):
"""Like *x head -N command"""
result = []
nlines = 0
assert n >= 1
for line in open(file_name):
result.append(line)
nlines += 1
if nlines >= n:
break
return result
if __name__ == "__main__":
import sys
rval = headn(sys.argv[1], int(sys.argv[2]))
print rval
print len(rval)
If you have a really big file, and assuming you want the output to be a numpy array, using np.genfromtxt will freeze your computer. This is so much better in my experience:
def load_big_file(fname,maxrows):
'''only works for well-formed text file of space-separated doubles'''
rows = [] # unknown number of lines, so use list
with open(fname) as f:
j=0
for line in f:
if j==maxrows:
break
else:
line = [float(s) for s in line.split()]
rows.append(np.array(line, dtype = np.double))
j+=1
return np.vstack(rows) # convert list of vectors to array
This worked for me
f = open("history_export.csv", "r")
line= 5
for x in range(line):
a = f.readline()
print(a)
I would like to handle the file with less than n-lines by reading the whole file
def head(filename: str, n: int):
try:
with open(filename) as f:
head_lines = [next(f).rstrip() for x in range(n)]
except StopIteration:
with open(filename) as f:
head_lines = f.read().splitlines()
return head_lines
Credit go to John La Rooy and Ilian Iliev. Use the function for the best performance with exception handle
Revise 1: Thanks FrankM for the feedback, to handle file existence and read permission we can futher add
import errno
import os
def head(filename: str, n: int):
if not os.path.isfile(filename):
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)
if not os.access(filename, os.R_OK):
raise PermissionError(errno.EACCES, os.strerror(errno.EACCES), filename)
try:
with open(filename) as f:
head_lines = [next(f).rstrip() for x in range(n)]
except StopIteration:
with open(filename) as f:
head_lines = f.read().splitlines()
return head_lines
You can either go with second version or go with the first one and handle the file exception later. The check is quick and mostly free from performance standpoint
Starting at Python 2.6, you can take advantage of more sophisticated functions in the IO base clase. So the top rated answer above can be rewritten as:
with open("datafile") as myfile:
head = myfile.readlines(N)
print head
(You don't have to worry about your file having less than N lines since no StopIteration exception is thrown.)
This works for Python 2 & 3:
from itertools import islice
with open('/tmp/filename.txt') as inf:
for line in islice(inf, N, N+M):
print(line)
fname = input("Enter file name: ")
num_lines = 0
with open(fname, 'r') as f: #lines count
for line in f:
num_lines += 1
num_lines_input = int (input("Enter line numbers: "))
if num_lines_input <= num_lines:
f = open(fname, "r")
for x in range(num_lines_input):
a = f.readline()
print(a)
else:
f = open(fname, "r")
for x in range(num_lines_input):
a = f.readline()
print(a)
print("Don't have", num_lines_input, " lines print as much as you can")
print("Total lines in the text",num_lines)
Here's another decent solution with a list comprehension:
file = open('file.txt', 'r')
lines = [next(file) for x in range(3)] # first 3 lines will be in this list
file.close()
An easy way to get first 10 lines:
with open('fileName.txt', mode = 'r') as file:
list = [line.rstrip('\n') for line in file][:10]
print(list)
#!/usr/bin/python
import subprocess
p = subprocess.Popen(["tail", "-n 3", "passlist"], stdout=subprocess.PIPE)
output, err = p.communicate()
print output
This Method Worked for me
Simply Convert your CSV file object to a list using list(file_data)
import csv;
with open('your_csv_file.csv') as file_obj:
file_data = csv.reader(file_obj);
file_list = list(file_data)
for row in file_list[:4]:
print(row)

Categories