I have a large text file that contains about 100,000,000 rows (I cannot read it into memory as a whole). I would like to read the n-th row efficiently. I found the question "How can I get python to read every nth line of a .txt file?" and, based on it, I constructed this function:
from itertools import islice
def read_n_line(file: str, n: int, encoding='utf-8') -> str:
    """Return line number ``n`` (1-based) of the text file ``file``.

    The file is streamed, so only one line is held in memory at a time, but
    every line before the requested one still has to be read and decoded.

    Raises:
        ValueError: if ``n`` is smaller than 1 (raised by ``islice``).
        StopIteration: if the file has fewer than ``n`` lines.
    """
    with open(file, encoding=encoding) as handle:
        # One-element window starting at zero-based line index n - 1.
        window = islice(handle, n - 1, n)
        return next(window)
The problem is that my function is fast (0.5 seconds) for n = 1000, but slow (15 seconds) for n = 10.000.000. Can I somehow improve my function to be fast for all n, please?

For sufficiently large files, it may be more efficient to use a Numba-based approach:
import numba as nb
# Scan one block for newline bytes, updating the running newline bookkeeping.
# NOTE(review): the parameter list after `def process(` is missing in this copy of the
# code. The visible call site passes (block, n, last_nl_pos, nl_pos, nl_count, offset),
# which matches the names used in the body -- confirm against the original.
# NOTE(review): the surrounding text calls this the Numba-based part, but no decorator
# is visible above the definition -- confirm whether one was lost.
def process(
nl = ord("\n")
# `i` starts counting at `offset` rather than at 0.
for i, c in enumerate(block, offset):
if c == nl:
# NOTE(review): `found` is assigned here but never read in the visible code.
found = True
# Keep the previous newline position before recording the new one.
last_nl_pos = nl_pos
nl_pos = i
nl_count += 1
if nl_count == n:
return last_nl_pos, nl_pos, nl_count
# NOTE(review): no return is visible for the case where the loop ends before
# `nl_count` reaches `n`, yet the caller unpacks three values -- presumably a
# trailing return was lost; confirm.
# Look for line `n` of a file by handing it to `process` one block at a time.
# NOTE(review): several expressions in this copy are truncated (marked below);
# the block is not runnable as shown.
def read_nth_line_nb(
filepath: str,
n: int,
# NOTE(review): the body calls `.decode(encoding)`, but no `encoding` parameter
# is visible in this signature -- presumably it was lost; confirm.
size=2 ** 22, # 4 MiB
) -> str:
# The file is opened in binary mode; the result is decoded at the very end.
with open(filepath, "rb") as file_obj:
last_nl_pos = nl_pos = -1
nl_count = -1
offset = 0
while True:
# NOTE(review): right-hand side lost; presumably a read of `size` bytes
# from `file_obj` -- confirm.
block =
if block:
last_nl_pos, nl_pos, nl_count = process(
block, n, last_nl_pos, nl_pos, nl_count, offset
# `offset` advances by the block size after each processed block.
offset += size
# NOTE(review): the two lines below are fragments. They look like a seek to
# `last_nl_pos + 1` followed by a read of `nl_pos - last_nl_pos` bytes -- confirm.
if nl_count == n: + 1)
return - last_nl_pos).decode(encoding)
# NOTE(review): no branch is visible for an empty `block` (end of file), so the
# `while True` loop has no visible exit when line `n` is never found -- confirm.
This essentially processes the file in blocks, keeping track of how many newlines have been seen and where they are (and how far into the file the current block starts).
For comparison I use the itertools.islice() approach:
import itertools
def read_nth_line_isl(filepath: str, n: int, encoding="utf-8") -> str:
    """Return the line at zero-based index ``n`` of ``filepath``.

    Returns ``None`` when the file has ``n`` lines or fewer. A negative ``n``
    is rejected by ``itertools.islice`` with ``ValueError``.
    """
    with open(filepath, "r", encoding=encoding) as handle:
        # The slice yields at most one line; return it as soon as it shows up.
        for line in itertools.islice(handle, n, n + 1):
            return line
        return None
as well as the naïve looping:
def read_nth_line_loop(filepath: str, n: int, encoding="utf-8") -> str:
    """Return the line at zero-based index ``n`` of ``filepath``.

    Lines are enumerated one by one until index ``n`` is reached. ``None`` is
    returned when no line has that index (file too short, or ``n`` negative).
    """
    with open(filepath, "r", encoding=encoding) as handle:
        matches = (text for index, text in enumerate(handle) if index == n)
        # Stops reading at the first match; falls back to None at end of file.
        return next(matches, None)
Assuming some files were created with the following:
import random
import string
def write_some_file(
    filepath: str,
    n: int,
    m: int = 10,
    l: int = 100,
    encoding="utf-8",
    k: int = 9,
) -> None:
    """Write ``n`` numbered lines of random ASCII letters to ``filepath``.

    Each line has the form ``"<index> - <letters>\\n"``: the index is
    zero-padded to ``k`` digits and the letter run has a random length in
    ``[m, l)``.

    Args:
        filepath: Destination file; it is overwritten.
        n: Number of lines to write.
        m: Minimum length of the random payload (inclusive).
        l: Maximum length of the random payload (exclusive).
        encoding: Text encoding of the output file.
        k: Zero-pad width of the line index. This used to be read from a
            module-level ``k``, which raised ``NameError`` whenever the
            function was called before that global had been assigned.

    Raises:
        ValueError: if ``m >= l`` (raised by ``random.randrange``).
    """
    with open(filepath, "w", encoding=encoding) as file_obj:
        for i in range(n):
            size = random.randrange(m, l)
            payload = "".join(random.choices(string.ascii_letters, k=size))
            file_obj.write(f"{i:0{k}d} - {payload}\n")
k = 9
for i in range(1, k):
n_max = 10 ** i
write_some_file(f"test{n_max:0{k}d}.txt", n_max)
It is possible to test that they all give the same result:
funcs = read_nth_line_isl, read_nth_line_loop, read_nth_line_nb
k = 9
n_max = 10 ** 5
filename = f"test{n_max:0{k}d}.txt"
for func in funcs:
print(f"{func.__name__:>20} {func(filename, n_max - 1)!r}")
# read_nth_line_isl '000099999 - sWBnniKkpROZYlqfFLbSttEwYCjXfhQSapkkqxjePpGbobJzgaJTCOCSyHQEcLppZ\n'
# read_nth_line_loop '000099999 - sWBnniKkpROZYlqfFLbSttEwYCjXfhQSapkkqxjePpGbobJzgaJTCOCSyHQEcLppZ\n'
# read_nth_line_nb '000099999 - sWBnniKkpROZYlqfFLbSttEwYCjXfhQSapkkqxjePpGbobJzgaJTCOCSyHQEcLppZ\n'
The timings can be computed with:
k = 9
timings = {}
for i in range(1, k - 1):
n_max = 10 ** i
filename = f"test{n_max:0{k}d}.txt"
timings[i] = []
base = funcs[0](filename, n_max - 1)
for func in funcs:
res = func(filename, n_max - 1)
is_good = base == res
if i < 6:
timed = %timeit -r 12 -n 12 -q -o func(filename, n_max - 1)
timed = %timeit -r 1 -n 1 -q -o func(filename, n_max - 1)
timing = * 1e3
timings[i].append(timing if is_good else None)
print(f"{func.__name__:>24} {is_good!s:5} {timing:10.3f} ms")
and plotted with:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(data=timings, index=[func.__name__ for func in funcs]).transpose()
df.plot(marker='o', xlabel='log₁₀(Input Size) / #', ylabel='Best timing / µs', figsize=(6, 4), logy=True)
fig = plt.gcf()
to obtain:
Indicating the Numba-based approach to be marginally faster (some 5-15%) for sufficiently large inputs (above 10⁵).


My python program sorts information incorrectly

I have no idea how to fix the sorting part of my program. Could anyone help me out here? The sorter should output something like what is shown in the image:
It takes data from the given URL and calculates its area. It should also sort the information, but right now it doesn't.
import json
import urllib.request
import requests
f = open('katastritunnused.txt', 'r')
response = requests.get('')
json_response = response.json()
This gets the info from the URL and calculates its area.
c = []
list2 = []
def bubblesort(c):
    """Sort the rows of ``c`` in place by their second element, ascending.

    ``c`` is a list of two-element rows such as ``[name, area]``. The list
    itself is returned so the call can be used in an assignment.

    Fixes over the previous version:
      * Whole rows are swapped. Swapping only ``row[1]`` moved the values
        between rows and left every ``row[0]`` where it was, so the two
        columns no longer belonged together after sorting.
      * The list is always returned; an empty list used to yield ``None``.
    """
    n = len(c)
    for done in range(n):
        jarjestatud = True  # "sorted": stays True if this pass swaps nothing
        # The last `done` rows are already in their final position.
        for x in range(n - done - 1):
            if c[x][1] > c[x + 1][1]:
                c[x], c[x + 1] = c[x + 1], c[x]
                jarjestatud = False
        if jarjestatud:
            break
    return c
list2 = []
for j, _ in enumerate(c):
templist = [_, list2[j]]
desclist = bubblesort(list2)
def writer(a):
    """Write the rows of ``a`` to ``sorteeritud.csv``, one ``first;second`` line each.

    Only the first two elements of every row are written; both are passed
    through ``str``. An existing file is overwritten.
    """
    with open('sorteeritud.csv', mode='w') as out_file:
        out_file.writelines(
            str(row[0]) + ';' + str(row[1]) + '\n' for row in a
        )

Python Code Speed Up

My code should compare two vectors saved as dictionaries (two pickle files) and save the result into a pickle file too. This works, but very slowly. For one comparison result I'm waiting about 7:20 min. Because I have a lot of videos (exactly 2033), this program will run for about 10 days. This is too long. How can I speed up my code for Python 2.7?
import math
import csv
import pickle
from itertools import izip
global_ddc_file = 'E:/global_ddc.p'
io = 'E:/AV-Datensatz'
v_source = ''
def dot_product(v1, v2):
    """Return the sum of the pairwise products of ``v1`` and ``v2``.

    Pairing stops at the shorter input, e.g. izip('ABCD', 'xy') --> Ax By.
    """
    return sum(a * b for a, b in izip(v1, v2))
def cosine_measure(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Computed as ``dot(v1, v2) / (|v1| * |v2|)``. When either vector has zero
    length the similarity is undefined and ``0`` is returned instead.

    Both arguments are iterated more than once, so they must be re-iterable
    sequences rather than one-shot iterators.
    """
    prod = dot_product(v1, v2)
    len1 = math.sqrt(dot_product(v1, v1))
    len2 = math.sqrt(dot_product(v2, v2))
    denominator = len1 * len2
    # `!=` replaces the obsolete `<>` spelling: same meaning on Python 2.7,
    # and unlike `<>` it is not a syntax error on Python 3.
    if denominator != 0:
        return prod / denominator
    return 0
def findSource(v):
    """Build the id and file locations for the record ``v``.

    ``v[0]`` has its leading ``<`` and trailing ``>`` characters removed and
    is prefixed with ``/`` to form the id.

    Returns:
        ``[v_id, v_source, v_file]`` where ``v_source`` is the module-level
        ``io`` base path plus the id and ``v_file`` is ``vector.p`` inside it.
    """
    v_id = "/" + v[0].lstrip("<").rstrip(">")
    v_source = io + v_id
    return [v_id, v_source, v_source + '/vector.p']
def getVector(v, vectorCol):
    """Load the pickled vector file ``v`` and extract one column from it.

    Args:
        v: Path of a pickle file holding a sequence of entries; element 1 of
            each entry is itself indexable.
        vectorCol: Index taken from element 1 of every entry.

    Returns:
        A list with ``entry[1][vectorCol]`` for each entry in the file.

    Raises:
        Whatever ``open`` or ``pickle.load`` raises. A load failure is
        reported on stdout and then re-raised; previously the error was
        swallowed and the function crashed on an unbound variable instead.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on files
    you created yourself.
    """
    with open(v, 'rb') as f:
        try:
            vector_v = pickle.load(f)
        except Exception:
            # Parenthesised single argument prints the same on Python 2 and 3.
            print('file couldnt be loaded')
            raise
    return [vec[1][vectorCol] for vec in vector_v]
def compareVectors(v1, v2, vectorCol):
    """Compare the stored vectors of two records.

    Returns:
        ``[id_1, id_2, similarity]`` where the ids come from ``findSource``
        and ``similarity`` is ``cosine_measure`` applied to column
        ``vectorCol`` of both vector files.
    """
    id_1, _, file_1 = findSource(v1)
    id_2, _, file_2 = findSource(v2)
    first = getVector(file_1, vectorCol)
    second = getVector(file_2, vectorCol)
    return [id_1, id_2, cosine_measure(first, second)]
#with open('videos_av_portal_cc_3.0_nur2bspStanford.csv', 'rb') as dataIn:
with open('videos_av_portal_cc_3.0_vollstaendig.csv', 'rb') as dataIn:
#with open('videos_av_portal_cc_3.0.csv', 'rb') as dataIn:
reader = csv.reader(dataIn)
v_source = []
for row in reader:
#print v_source
for one in v_source:
print one[1]
compVec = []
for another in v_source:
if one <> another:
compVec.append(compareVectors(one, another, 3))
compVec_sort = sorted(compVec, key=lambda cosim: cosim[2], reverse = True)
# save vector file for each video
with open (one[1] + '/compare.p','wb') as f:
Split the code into 2 parts:
1. Load the dictionaries into vectors.
2. Compare 2 dictionaries using multiprocessing (see a multiprocessing example).
3. Launch the processes simultaneously according to memory availability and end each process after 8 minutes. Then update the 3rd dictionary.
4. Then relaunch the processes on the next set of data, following step 3, and continue until the whole dictionary has been processed.
This should reduce the total turnaround time.
Let me know if you need code.

Better examples of Parallel processing in Python

I hope I am not downvoted this time. I have been struggling with parallel processing in Python for a while (2 days, exactly). I have been checking these resources (a partial list is shown here):
I came unstuck. What I want to do is this:
Break up the file into chunks(strings or numbers)
Broadcast a pattern to be searched to all the workers
Receive the offsets in the file where the pattern was found
Receive pattern and chunk of text from the master
Send back the offsets to the master.
I tried to implement this using MPI/concurrent.futures/multiprocessing and came unstuck.
My naive implementation using multiprocessing module
import multiprocessing
filename = "file1.txt"
pat = "afow"
N = 1000
""" This is the naive string search algorithm"""
def search(pat, txt):
    """Return every offset at which ``pat`` occurs in ``txt``.

    Overlapping occurrences are included. The offsets are returned as one
    comma-separated string (e.g. ``"0, 3"``); the string is empty when there
    is no match.

    The previous copy of this function never added anything to ``offsets``
    (the body of its ``if counter >= patLen:`` was missing), so it could only
    ever return an empty string.
    """
    offsets = []
    start = txt.find(pat)
    while start != -1:
        offsets.append(start)
        # Resume one character further on so overlapping matches are found.
        start = txt.find(pat, start + 1)
    return str(offsets).strip('[]')
This is what I want
if __name__ == "__main__":
tasks = []
pool_outputs = []
pool = multiprocessing.Pool(processes=5)
with open(filename, 'r') as infile:
lines = []
for line in infile:
if len(lines) > N:
pool_output =, tasks)
lines = []
if len(lines) > 0:
pool_output =, tasks)
print('Pool:', pool_outputs)
with open(filename, 'r') as infile:
for line in infile:
print(search(pat, line))
I would be grateful for any guidance especially with the concurrent.futures. Thanks for your time. Valeriy helped me with his addition and I thank him for that.
But if anyone could just indulge me for a moment, this is the code I was working on for the concurrent.futures(working off an example I saw somewhere)
from concurrent.futures import ProcessPoolExecutor, as_completed
import math
def search(pat, txt):
    """Return every offset at which ``pat`` occurs in ``txt``.

    Overlapping occurrences are included. The offsets are returned as one
    comma-separated string (e.g. ``"0, 3"``); the string is empty when there
    is no match.

    The previous copy of this function never added anything to ``offsets``
    (the body of its ``if counter >= patLen:`` was missing), so it could only
    ever return an empty string.
    """
    offsets = []
    start = txt.find(pat)
    while start != -1:
        offsets.append(start)
        # Resume one character further on so overlapping matches are found.
        start = txt.find(pat, start + 1)
    return str(offsets).strip('[]')
#Check a list of strings
def chunked_worker(lines):
    """Run ``search("fmo", line)`` for every line and return a dict of results.

    NOTE(review): every result is stored under the constant key ``0``, so the
    returned dict holds only the result for the last line (or is empty when
    ``lines`` is empty). A per-line key was presumably intended -- confirm.
    """
    outcome = {}
    for line in lines:
        outcome[0] = search("fmo", line)
    return outcome
def pool_bruteforce(filename, nprocs):
    """Search the lines of ``filename`` in ``nprocs`` chunks using a process pool.

    The file is read completely, split into ``nprocs`` chunks of (almost)
    equal size, and each chunk is handed to ``chunked_worker`` in a worker
    process.

    Returns:
        One dict holding the merged results of all workers. Workers finish in
        arbitrary order, so for equal keys the last worker to finish wins.

    The previous copy had an empty ``for f in as_completed(futures):`` loop,
    so worker results were never collected.
    """
    with open(filename) as f:
        lines = [line.rstrip('\n') for line in f]
    chunksize = int(math.ceil(len(lines) / float(nprocs)))
    futures = []
    resultdict = {}
    with ProcessPoolExecutor() as executor:
        for i in range(nprocs):
            chunk = lines[(chunksize * i): (chunksize * (i + 1))]
            futures.append(executor.submit(chunked_worker, chunk))
        # Merge each worker's dict as soon as it is available;
        # .result() re-raises any exception from the worker.
        for future in as_completed(futures):
            resultdict.update(future.result())
    return resultdict
filename = "file1.txt"
pool_bruteforce(filename, 5)
Thanks again , Valeriy and anyone who attempts to help me solve my riddle.
You are using several arguments, so:
import multiprocessing
from functools import partial
filename = "file1.txt"
pat = "afow"
N = 1000
""" This is the naive string search algorithm"""
def search(pat, txt):
    """Return every offset at which ``pat`` occurs in ``txt``.

    Overlapping occurrences are included. The offsets are returned as one
    comma-separated string (e.g. ``"0, 3"``); the string is empty when there
    is no match.

    The previous copy of this function never added anything to ``offsets``
    (the body of its ``if counter >= patLen:`` was missing), so it could only
    ever return an empty string.
    """
    offsets = []
    start = txt.find(pat)
    while start != -1:
        offsets.append(start)
        # Resume one character further on so overlapping matches are found.
        start = txt.find(pat, start + 1)
    return str(offsets).strip('[]')
if __name__ == "__main__":
tasks = []
pool_outputs = []
pool = multiprocessing.Pool(processes=5)
lines = []
with open(filename, 'r') as infile:
for line in infile:
tasks = lines
func = partial(search, pat)
if len(lines) > N:
pool_output =, lines )
elif len(lines) > 0:
pool_output =, lines )
print('Pool:', pool_outputs)

Python 3 - Can pickle handle byte objects larger than 4GB?

Based on this comment and the referenced documentation, Pickle 4.0+ from Python 3.4+ should be able to pickle byte objects larger than 4 GB.
However, using python 3.4.3 or python 3.5.0b2 on Mac OS X 10.10.4, I get an error when I try to pickle a large byte array:
>>> import pickle
>>> x = bytearray(8 * 1000 * 1000 * 1000)
>>> fp = open("x.dat", "wb")
>>> pickle.dump(x, fp, protocol = 4)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
OSError: [Errno 22] Invalid argument
Is there a bug in my code or am I misunderstanding the documentation?
Here is a simple workaround for issue 24658. Use pickle.loads or pickle.dumps and break the bytes object into chunks of size 2**31 - 1 to get it in or out of the file.
import pickle
import os.path
# Demonstration: move a pickle of 2 GiB or more through a file in chunks that
# each stay below 2**31 bytes, so no single read()/write() call hits the
# macOS limit. WARNING: running this allocates several GiB of memory.
file_path = "pkl.pkl"
n_bytes = 2**31
max_bytes = 2**31 - 1
data = bytearray(n_bytes)

## write
bytes_out = pickle.dumps(data)
# A memoryview lets each chunk be written without first copying it.
view_out = memoryview(bytes_out)
with open(file_path, 'wb') as f_out:
    for idx in range(0, len(bytes_out), max_bytes):
        f_out.write(view_out[idx:idx + max_bytes])

## read
bytes_in = bytearray(0)
input_size = os.path.getsize(file_path)
with open(file_path, 'rb') as f_in:
    for _ in range(0, input_size, max_bytes):
        bytes_in += f_in.read(max_bytes)
data2 = pickle.loads(bytes_in)

assert data == data2
To sum up what was answered in the comments:
Yes, Python can pickle byte objects bigger than 4GB. The observed error is caused by a bug in the implementation (see Issue24658).
Here is the full workaround, though it seems pickle.load no longer tries to dump a huge file anymore (I am on Python 3.5.2) so strictly speaking only the pickle.dumps needs this to work properly.
import pickle
class MacOSFile(object):
    """File-object wrapper that splits very large reads and writes into chunks.

    Some platforms fail with ``OSError: [Errno 22] Invalid argument`` when a
    single ``read``/``write`` call moves 2 GiB or more. This wrapper forwards
    everything to the wrapped file ``f`` but never asks it to transfer more
    than ``max_chunk`` bytes in one call.

    Args:
        f: The underlying binary file object.
        max_chunk: Largest number of bytes per underlying call. Defaults to
            ``2**31 - 1``. (The previous ``1 << 31 - 1`` parsed as
            ``1 << 30`` because ``-`` binds tighter than ``<<``.)
    """

    def __init__(self, f, max_chunk=(1 << 31) - 1):
        if max_chunk < 1:
            raise ValueError("max_chunk must be at least 1")
        self.f = f
        self.max_chunk = max_chunk

    def __getattr__(self, item):
        # Only called when normal lookup fails. Refuse our own attributes so
        # a half-initialised instance does not recurse forever via `self.f`.
        if item in ("f", "max_chunk"):
            raise AttributeError(item)
        return getattr(self.f, item)

    def read(self, n):
        """Read up to ``n`` bytes, in pieces of at most ``max_chunk`` bytes.

        Requests that fit into one chunk are passed straight through
        (previously they returned ``None``). Larger requests return a
        ``bytearray``, truncated if the file ends early.
        """
        if n <= self.max_chunk:
            return self.f.read(n)
        buffer = bytearray(n)
        idx = 0
        while idx < n:
            chunk = self.f.read(min(n - idx, self.max_chunk))
            if not chunk:
                break  # end of file before n bytes were available
            # Advance by what was actually read, which may be a short read.
            buffer[idx:idx + len(chunk)] = chunk
            idx += len(chunk)
        del buffer[idx:]
        return buffer

    def write(self, buffer):
        """Write ``buffer`` in pieces of at most ``max_chunk`` bytes.

        Returns the total number of bytes handed to the underlying file.
        Progress is reported on stdout for every call and every piece.
        """
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, self.max_chunk)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size
        return n
def pickle_dump(obj, file_path):
    """Pickle ``obj`` into ``file_path`` through a ``MacOSFile`` wrapper.

    The highest available pickle protocol is used. Returns whatever
    ``pickle.dump`` returns.
    """
    with open(file_path, "wb") as raw:
        wrapped = MacOSFile(raw)
        outcome = pickle.dump(obj, wrapped, protocol=pickle.HIGHEST_PROTOCOL)
    return outcome
def pickle_load(file_path):
    """Unpickle and return the object stored in ``file_path``.

    The file is read through a ``MacOSFile`` wrapper.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(file_path, "rb") as raw:
        wrapped = MacOSFile(raw)
        loaded = pickle.load(wrapped)
    return loaded
You can specify the protocol for the dump.
If you do pickle.dump(obj,file,protocol=4) it should work.
Reading a file by 2GB chunks takes twice as much memory as needed if bytes concatenation is performed, my approach to loading pickles is based on bytearray:
class MacOSFile(object):
    """Read-side file wrapper that splits very large reads into chunks.

    Large requests are assembled in one pre-sized ``bytearray`` instead of
    by concatenating ``bytes`` objects, so the data is not held twice.

    Args:
        f: The underlying binary file object.
        max_chunk: Largest number of bytes requested from ``f`` in one call.
            Defaults to ``2**31 - 1``. (The previous ``1 << 31 - 1`` parsed
            as ``1 << 30`` because ``-`` binds tighter than ``<<``.)
    """

    def __init__(self, f, max_chunk=(1 << 31) - 1):
        if max_chunk < 1:
            raise ValueError("max_chunk must be at least 1")
        self.f = f
        self.max_chunk = max_chunk

    def __getattr__(self, item):
        # Only called when normal lookup fails. Refuse our own attributes so
        # a half-initialised instance does not recurse forever via `self.f`.
        if item in ("f", "max_chunk"):
            raise AttributeError(item)
        return getattr(self.f, item)

    def read(self, n):
        """Read up to ``n`` bytes, in pieces of at most ``max_chunk`` bytes.

        Requests that fit into one chunk are passed straight through
        (previously they returned ``None``). Larger requests return a
        ``bytearray``, truncated if the file ends early.
        """
        if n <= self.max_chunk:
            return self.f.read(n)
        buffer = bytearray(n)
        pos = 0
        while pos < n:
            chunk = self.f.read(min(n - pos, self.max_chunk))
            if not chunk:
                break  # end of file before n bytes were available
            # Advance by what was actually read, which may be a short read.
            buffer[pos:pos + len(chunk)] = chunk
            pos += len(chunk)
        del buffer[pos:]
        return buffer
with open("/path", "rb") as fin:
obj = pickle.load(MacOSFile(fin))
Had the same issue and fixed it by upgrading to Python 3.6.8.
This seems to be the PR that did it:
I also ran into this issue. To solve it, I split the work into several iterations. Let's say that in this case I have 50,000 documents for which I have to calculate tf-idf and do kNN classification. When I run it and iterate over all 50,000 directly, it gives me "that error". So, to solve this problem, I process the data in chunks.
tokenized_documents = self.load_tokenized_preprocessing_documents()
idf = self.load_idf_41227()
doc_length = len(documents)
for iteration in range(0, 9):
tfidf_documents = []
for index in range(iteration, 4000):
doc_tfidf = []
for term in idf.keys():
tf = self.term_frequency(term, tokenized_documents[index])
doc_tfidf.append(tf * idf[term])
doc = documents[index]
tfidf = [doc_tfidf, doc[0], doc[1]]
print("{} from {} document {}".format(index, doc_length, doc[0]))
self.save_tfidf_41227(tfidf_documents, iteration)

head, tail and backward read by lines of a text file

How to implement something like the 'head' and 'tail' commands in Python, and how to read a text file backward, line by line?
This is my personal file class ;-)
# Subclass of the Python 2 built-in `file` type adding head(), tail() and
# backward() helpers.
# NOTE(review): several statements in this copy are truncated fragments (marked
# below); they look like lost `seek(...)` / `read(...)` calls -- confirm against
# the original before use.
class File(file):
""" A helper class for file reading """
def __init__(self, *args, **kwargs):
super(File, self).__init__(*args, **kwargs)
# Block size used when scanning the file from its end.
self.BLOCKSIZE = 4096
# Return the next `lines_2find` lines as a list.
# NOTE(review): the trailing comment says "Rewind file", but no rewind call is
# visible; presumably it was lost -- confirm.
def head(self, lines_2find=1): #Rewind file
return [super(File, self).next() for x in xrange(lines_2find)]
# Return the last `lines_2find` lines as a list.
# NOTE(review): the `, 2)` after the signature is the tail of a lost statement
# (per its comment, a move to the end of the file).
def tail(self, lines_2find=1):, 2) #Go to end of file
bytes_in_file = self.tell()
lines_found, total_bytes_scanned = 0, 0
# Continue until more than `lines_2find` newlines were counted or the whole
# file has been scanned.
while (lines_2find + 1 > lines_found and
bytes_in_file > total_bytes_scanned):
# NOTE(review): the next two lines are fragments of a `min(...)` call and of a
# statement ending in `, 2)`; their full form is not visible.
byte_block = min(
bytes_in_file - total_bytes_scanned) -(byte_block + total_bytes_scanned), 2)
total_bytes_scanned += byte_block
# NOTE(review): fragment; presumably counts '\n' in the block just read.
lines_found +='\n'), 2)
line_list = list(self.readlines())
return line_list[-lines_2find:]
# Generator yielding the rows of the file from last to first.
# NOTE(review): `, 2)` after the signature is again the tail of a lost statement.
def backward(self):, 2) #Go to end of file
blocksize = self.BLOCKSIZE
last_row = ''
while self.tell() != 0:
# NOTE(review): the `try:` body, the end of the `except` branch and the
# assignment to `block` are truncated in this copy.
try:, 1)
except IOError:
blocksize = self.tell(), 1)
block =, 1)
rows = block.split('\n')
# The last piece of this block continues into the row carried over from the
# previously processed block, so the two are joined.
rows[-1] = rows[-1] + last_row
while rows:
last_row = rows.pop(-1)
# Yield only when pieces remain and the row is non-empty; otherwise the row
# is kept in `last_row`.
if rows and last_row:
yield last_row
yield last_row
Example usage:
with File('') as f:
print f.head(5)
print f.tail(5)
for row in f.backward():
print row
head is easy:
from itertools import islice
with open("file") as f:
for line in islice(f, n):
print line
tail is harder if you don't want to keep the whole file in memory. If the input is a file, you could start reading blocks beginning at the end of the file. The original tail also works if the input is a pipe, so a more general solution is to read and discard the whole input, except for the last few lines. An easy way to do this is collections.deque:
from collections import deque
with open("file") as f:
for line in deque(f, maxlen=n):
print line
In both these code snippets, n is the number of lines to print.
def tail(fname, lines):
    """Read last N lines from file fname.

    The file is read backwards in blocks of ``BUFSIZ`` bytes until enough
    newlines have been seen, so only the end of a large file is touched.

    Args:
        fname: Path of the file to read.
        lines: Number of trailing lines wanted.

    Returns:
        A list of at most ``lines`` strings without line terminators.
        Whitespace (including blank lines) at the very end of the file is
        ignored. ``[]`` is returned when ``lines <= 0``.

    Changes over the previous version:
      * The file is closed again (it used to leak the handle).
      * The file is opened in binary mode, because Python 3 rejects
        end-relative seeks on text files. Lines are decoded with the
        locale's preferred encoding, the default ``open()`` would have used.
      * ``lines == 0`` no longer returns everything that was read.
      * Each block is read once instead of re-reading to end of file.
    """
    import locale
    import os

    if lines <= 0:
        return []
    BUFSIZ = 1024
    data = b""
    with open(fname, 'rb') as f:
        f.seek(0, os.SEEK_END)
        pos = f.tell()
        while pos > 0:
            step = min(BUFSIZ, pos)
            pos -= step
            f.seek(pos)
            data = f.read(step) + data
            # `lines` newlines inside the stripped data mean at least
            # `lines` complete rows follow the (possibly partial) first one.
            if data.strip().count(b'\n') >= lines:
                break
    encoding = locale.getpreferredencoding(False)
    # Only the kept rows are decoded; a partial first row (which may start in
    # the middle of a multi-byte character) is never among them.
    return [row.decode(encoding) for row in data.strip().splitlines()[-lines:]]
