Python read and write a file faster - python

Here is my code for reading a huge file (more than 15 GiB) called interactions.csv, checking each row and, based on that check, splitting the interactions file into two separate files: test.csv and train.csv.
It takes more than two days to finish on my machine. Is there any way I can make this code faster, maybe using some kind of parallelism?
target_items: a list containing some item IDs
The current program:
# Split the interactions file into test.csv / train.csv depending on whether
# the second tab-separated column of a row is one of the target items.
# `interactions` (input path) and `target_items` (item IDs) come from the caller.

# Build the lookup set once: `x in list` scans the whole list for every row,
# while `x in set` is a constant-time hash lookup.
target_set = set(target_items)
with open(interactions) as interactionFile, open("train.csv", "w") as train, open("test.csv", "w") as test:
    header = next(interactionFile)
    # Lines read from a file keep their trailing newline, so they are written
    # back unchanged (appending '\n' would insert a blank line after each row).
    train.write(header)
    test.write(header)
    for i, row in enumerate(interactionFile):
        # process each row; only column 1 is needed, so stop splitting early
        l = row.split('\t', 2)
        if l[1] in target_set:
            test.write(row)
        else:
            train.write(row)
        # Progress report: printing every row is itself a major slowdown.
        if i % 1000000 == 0:
            print(i)

Related

How can I split csv files in python?

Because of a memory error, I have to split my CSV files. I researched it and found this code by Stack Overflow user Aziz Alto:
# Load every line of the source CSV into memory, then write consecutive
# slices of 10,000,000 lines to 1.csv, 2.csv, ... The header line is only
# part of the first slice; it is not repeated in later files.
csvfile = open('#', 'r').readlines()
filename = 1
for i in range(0, len(csvfile), 10000000):
    part = csvfile[i:i+10000000]
    open(str(filename) + '.csv', 'w+').writelines(part)
    filename += 1
It works well, but the code does not add the header to the second file, which is very important for me. How can I add the header to the second file?
import pandas as pd

# Stream the CSV in chunks of 5,000,000 rows so the whole file is never
# held in memory at once.
rows = pd.read_csv("csvfile.csv", chunksize=5000000)
for i, chunk in enumerate(rows):
    # i is the chunk number of each iteration.
    # index=False keeps the output columns identical to the input; by default
    # to_csv would prepend the DataFrame row index as an extra unnamed column.
    chunk.to_csv('out{}.csv'.format(i), index=False)
With chunksize you specify how many rows you want per file (note that Excel can hold at most 1,048,576 rows per sheet).
This will save each chunk of 5,000,000 rows to its own file, with the header.
Hope this helps!
On the 2nd till last file you have to always add the 1st line of your original file (the one containing the header):
from itertools import islice

# Split the input CSV into 1.csv, 2.csv, ... with the header repeated at the
# top of every output file. The input is streamed: only `linesPerFile` data
# rows are held in memory at a time, instead of the whole file.
linesPerFile = 1000000  # data rows per output file (the header is extra)
filename = 1
with open('#', 'r') as f:
    header = f.readline()
    chunk = list(islice(f, linesPerFile))
    # The second condition still produces 1.csv for a header-only input.
    while chunk or (header and filename == 1):
        with open(str(filename) + '.csv', 'w') as out:
            out.write(header)
            out.writelines(chunk)
        filename += 1
        chunk = list(islice(f, linesPerFile))
Fast csv file splitting
If you have a very big file and you have to try different partitions (say to find the best way to split it) the above solutions are too slow to try.
Another way to solve this (and a very fast one) is to create an index file by record number. It takes about six minutes to create an index file of a csv file of 6867839 rows and 9 Gb, and an additional 2 minutes for joblib to store it on disk.
This method is particularly impressive if you are dealing with huge files, like 3 Gb or more.
Here's the code for creating the index file:
# Usage:
# creaidx.py filename.csv
# indexes a csv file by record number. This can be used to
# access any record directly or to split a file without the
# need of reading it all. The index file is joblib-stored as
# filename.index
# filename.csv is the file to create index for
import os,sys,joblib
# Number of bytes read from the csv file per block; the block numbers and
# offsets stored in the index are expressed in units of this size.
BLKSIZE=512
def checkopen(s,m='r',bz=None):
    """Open the file `s` only if it already exists.

    Parameters:
        s:  path of the file to open.
        m:  mode string passed to open() (default 'r').
        bz: optional buffer size passed to open() as its third argument.

    Returns the open file object, or None when `s` does not exist. The
    existence check runs before open(), so a write mode never creates a
    missing file. Errors raised by open() itself are not caught.
    """
    if not os.access(s,os.F_OK):
        return None
    if bz is None:
        return open(s,m) # returns open file
    return open(s,m,bz) # returns open file with buffer size
# Scan the current block (global `buff`) for b'\r' bytes and append one
# [ix, blk, offset] entry to the global list `idx` per step, where `offset`
# is `off+n`, relative to the start of the block.
# Reads and updates the globals ix and off; reads blk and buff.
# NOTE(review): only b'\r' is searched for and 2 bytes are skipped after it,
# so this presumably expects CRLF line endings — confirm against the inputs.
def get_blk():
global ix,off,blk,buff
# This `while True` runs at most once: every path through it ends in `break`.
# It only selects the starting `off` and the adjustment `n` for this block.
while True: # dealing with special cases
# Very first call: record 0 is entered at the current `off` with no adjustment.
if ix==0:
n=0
break
# NOTE(review): on Python 3 indexing a bytes object yields an int, so
# buff[0]==b'\r' can never be true there — confirm whether this branch is
# meant to run.
if buff[0]==b'\r':
n=2
off=0
break
# `off` still holds the value left by the previous call's scan loop, which
# stops once off is negative or >= BLKSIZE-2.
if off==BLKSIZE-2:
n=0
off=0
break
if off==BLKSIZE-1:
n=0
off=1
break
# Default: start from the first b'\r' in this block (find returns -1 if none).
n=2
off=buff.find(b'\r')
break
# Append one entry per iteration, then look for the next b'\r' starting two
# bytes further on; stops when none is found or the hit is in the last two
# bytes of the block.
while (off>=0 and off<BLKSIZE-2):
idx.append([ix,blk,off+n])
# g.write('{},{},{}\n'.format(ix,blk,off+n))
print(ix,end='\r')
n=2
ix+=1
off= buff.find(b'\r',off+2)
def crea_idx():
    """Build the global record index `idx` for the open file `f`.

    Reads `f` in BLKSIZE-byte blocks, calling get_blk() on each full block
    and incrementing the global block counter `blk` after it. get_blk() is
    then called once more for the final short (possibly empty) read, and the
    offset of the last index entry is overwritten with -1.
    """
    global buff,blk
    while True:
        buff=f.read(BLKSIZE)
        if len(buff)!=BLKSIZE:
            # Short read: this is the final block.
            break
        get_blk()
        blk+=1
    get_blk()
    last_entry=idx[-1]
    last_entry[2]=-1
# Script entry point: build <name>.index for the csv file given on the
# command line, unless an index at least as recent as the csv already exists.
if len(sys.argv)==1:
    sys.exit("Need to provide a csv filename!")
# Global scanning state shared by get_blk() and crea_idx().
ix=0
blk=0
off=0
idx=[]
buff=b'0'
s=sys.argv[1]
f=checkopen(s,'rb')
if f is None:
    # checkopen returns None for a missing file; fail with a clear message
    # instead of crashing later on f.read().
    sys.exit(s+' does not exist.')
idxfile=s.replace('.csv','.index')
try:
    # Rebuild when the index is missing or older than the csv. Modification
    # time is compared because on Unix getctime is the last metadata change,
    # not the last content change.
    if (not os.path.exists(idxfile)
            or os.path.getmtime(idxfile)<os.path.getmtime(s)):
        crea_idx()
        # joblib.dump creates/overwrites the index file itself, so it is not
        # opened (and truncated) beforehand.
        joblib.dump(idx,idxfile)
finally:
    f.close()
Let's use a toy example:
strings,numbers,colors
string1,1,blue
string2,2,red
string3,3,green
string4,4,yellow
The index file will be:
[[0, 0, 0],
[1, 0, 24],
[2, 0, 40],
[3, 0, 55],
[4, 0, 72],
[5, 0, -1]]
Note the -1 at the last index element to indicate end of index file in case of a sequential access. You can use a tool like this to access any individual row of the csv file:
def get_rec(n=1,binary=False):
    """Return record `n` of the indexed csv file (0 is the header line).

    Uses the globals `idx` (list of [record, block, offset] entries), `f`
    (the csv file opened in binary mode) and `BLKSIZE`.

    Parameters:
        n:      record number; negative values are treated as record 0.
        binary: when True return bytes terminated with b'\\r\\n', otherwise
                return the decoded text without a line terminator.

    Returns an empty value of the requested type (b'' or '') when the index
    is empty, `n` is past the end of the index, or the entry is the -1 end
    marker.
    """
    n=1 if n<0 else n+1
    empty=b'' if binary else ''
    if n>len(idx) or idx[n-1][2]==-1:
        return empty
    f.seek(idx[n-1][1]*BLKSIZE+idx[n-1][2])
    # Collect raw bytes and decode once at the end: decoding block by block
    # could split a multi-byte character at a block boundary.
    parts=[]
    while True:
        buff=f.read(BLKSIZE)
        x=buff.find(b'\r')
        if x!=-1:
            parts.append(buff[:x])
            break
        if not buff:
            # End of file without a terminator: return what was read
            # instead of looping forever on empty reads.
            break
        parts.append(buff)
    rec=b''.join(parts)
    return rec+b'\r\n' if binary else rec.decode()
The first field of the index record is obviously unnecessary. It is kept there for debugging purposes. As a side note, if you substitute this field by any field in the csv record and you sort the index file by that field, then you have the csv file sorted by that field if you use the index field to access the csv file.
Now, once you have your index file created, you just call the following program with two command-line parameters: the filename (the one whose index was already created) and a number between 1 and 100, which is the percentage at which the file will be split:
import sys
import time
import joblib
from common import Drv,checkopen

# Split <file>.csv into <file>.part1.csv and <file>.part2.csv at a given
# percentage of its records, using the record index built by creaidx.py.
# Command line: <csv filename> <percentage, integer 1-100>
# NOTE(review): get_rec() is neither defined nor imported in this listing;
# it must be in scope and it reads the globals f, idx and BLKSIZE set here.
start_time = time.time()
BLKSIZE=512
WSIZE=1048576 # pow(2,20) 1Mb for faster reading/writing
ix=0
blk=0
off=0
idx=[]
buff=b'0'
if len(sys.argv)<3:
    sys.exit('Argument missing!')
s=Drv+sys.argv[1]
# Reject values outside 1-100: they would index outside idx or produce a
# negative byte count below.
if sys.argv[2].isdecimal() and 1<=int(sys.argv[2])<=100:
    pct=int(sys.argv[2])/100
else:
    sys.exit('Bad percentage: '+sys.argv[2])
f=checkopen(s,'rb')
if f is None:
    sys.exit(s+' does not exist.')
idxfile=s.replace('.csv','.index')
probe=checkopen(idxfile)
if probe:
    probe.close()  # only used as an existence test; do not leak the handle
    print('Loading index...')
    idx=joblib.load(idxfile)
    print('Done loading index.')
else:
    sys.exit(idxfile+' does not exist.')
head=get_rec(0,True)
# Number of data records that go into part 1 (idx also holds the header
# entry and the trailing end marker, hence the -2).
n=int(pct*(len(idx)-2))
# NOTE(review): with 100 the split record idx[n+1] is the -1 end marker, so
# the offsets below are not valid — confirm how 100% should behave.
# NOTE(review): the extra -1 leaves the final byte before the split point
# (the '\n' of the last part-1 record) out of part 1 — confirm it is intended.
off=idx[n+1][1]*BLKSIZE+idx[n+1][2]-len(head)-1
num=off//WSIZE
res=off%WSIZE
sout=s.replace('.csv','.part1.csv')
i=0
with open(sout,'wb') as g:
    g.write(head)
    f.seek(idx[1][1]*BLKSIZE+idx[1][2])
    # Copy `off` bytes: `num` full WSIZE chunks followed by the remainder.
    for x in range(num):
        print(i,end='\r')
        i+=1
        buff=f.read(WSIZE)
        g.write(buff)
    buff=f.read(res)
    g.write(buff)
print()
i=0
sout=s.replace('.csv','.part2.csv')
with open(sout,'wb') as g:
    g.write(head)
    f.seek(idx[n+1][1]*BLKSIZE+idx[n+1][2])
    # Copy everything from the split record to the end of the file.
    buff=f.read(WSIZE)
    while len(buff)==WSIZE:
        g.write(buff)
        print(i,end='\r')
        i+=1
        buff=f.read(WSIZE)
    g.write(buff)
f.close()
end_time = time.time()
The file are created using blocks of 1048576 bytes. You can play with that figure to make file creation faster or to tailor it to machines with less memory resources.
The file is split only on two partitions, each of them having the header of the original file. It is not too difficult to change the code to make it
split files into more than two partitions.
Finally to put things in perspective, to split a csv file of 6867839 rows and 9 Gb by 50%, it took me roughly 6 minutes to create the index file and another 2 minutes for joblib to store it on disk. It took 3 additional minutes to split the file.

A biopython script to split a large fasta file into multiple ones

I am working on a large fasta file that I want to split into multiple files according to the gene id. I am trying to use the script below from the Biopython tutorial:
def batch_iterator(iterator, batch_size):
    """Returns lists of length batch_size.

    This can be used on any iterator, for example to batch up
    SeqRecord objects from Bio.SeqIO.parse(...), or to batch
    Alignment objects from Bio.AlignIO.parse(...), or simply
    lines from a file handle.

    This is a generator function, and it returns lists of the
    entries from the supplied iterator. Each list will have
    batch_size entries, although the final list may be shorter.

    Entries are batched whatever their value (falsy entries such as 0 or ''
    do not end the iteration). Raises ValueError if batch_size is less
    than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    batch = []
    # A plain for loop works on Python 2 and 3 alike (no iterator.next()).
    for entry in iterator:
        batch.append(entry)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        # Final, possibly shorter, batch
        yield batch
# Write the records of the sorted FASTA file to gene_1.fasta, gene_2.fasta,
# ... with 93 records per file (the last file may hold fewer).
# The input handle is opened in a `with` block so it is closed when done.
with open('/path/sorted_sequences.fa') as input_handle:
    record_iter=SeqIO.parse(input_handle, 'fasta')
    for i, batch in enumerate (batch_iterator(record_iter, 93)):
        filename='gene_%i.fasta' % (i + 1)
        with open('/path/files/' + filename, 'w') as ouput_handle:
            count=SeqIO.write(batch, ouput_handle, 'fasta')
        print ('Wrote %i records to %s' % (count, filename))
It does split the file into files of 93 sequences each, but it produces 2 files per group of 93. I cannot see the error, but I guess there is one.
There is another way I could split the large fasta file in a different way?
Thanks
After reading the code in the example, the iterator does not seem to separate files per gene id but just make a divition of the sequences in groups of batch_size, so in your case 93 sequences per file.
In case anyone is interested in this script in the future: the script works perfectly the way it is. The problem was that the file I was trying to divide had more sequences than it should have. So I deleted the bad file and produced a new one, which split nicely with the above script.

Python-comparing mutliple files from different folders and generating diff files

i want to automate below scenario in python
Actual-
cc0023-base.txt
cc9038.final.txt
Expected:
base.txt
final.txt
"Actual" and "Expected" are two different folders under the same directory. I want to compare the "base" and "final" files of both folders and generate the diff files in another folder.
Diff:
base-diff.txt
final-diff.txt
How do I do this in Python? Below is the sample code I have written, but it generates diff files for all possible combinations. I need base to be compared only with base, and final only with final, across the two folders.
# Question code: for the files listed in the expected and actual folders,
# write unified diffs into the diff folder under the names in `cr`.
# NOTE(review): the indentation of this listing was lost, so the exact loop
# nesting (in particular where i=i+1 runs) cannot be confirmed from here.
expected_files=os.listdir('expected/path')
actual_files = os.listdir('actual/path')
# NOTE(review): diff_files is assigned but not referenced again in this listing.
diff_files=os.listdir('diff/path')
cr=['base.txt','final.txt']
i=0
for files in expected_files:
tst=os.path.join('expected/path',files)
with open(tst,'r')as Expected:
for actualfile in actual_files:
actualpath=os.path.join('actual/path',actualfile)
with open(actualpath,'r') as actual:
# NOTE(review): readlines() consumes the handle, so a second call on the
# same Expected handle returns an empty list.
# NOTE(review): fromfile/tofile receive the file objects, not their names.
diff=difflib.unified_diff(Expected.readlines(),
actual.readlines(),
fromfile=Expected,
tofile=actual,)
# cr has two entries, so cr[i] raises IndexError once i reaches 2.
diffpath=os.path.join('diff/path',cr[i])
diff_file = open(diffpath, 'w')
for line in diff:
diff_file.write(line)
diff_file.close()
i=i+1
Please help,as i am new to python
The issue in your code is in this section:
# Excerpt quoted for discussion: i is reset to 0 immediately before cr[i] is
# read, so the output name is always cr[0].
i=0
diffpath=os.path.join('diff/path',cr[i])
diff_file = open(diffpath, 'w')
for line in diff:
diff_file.write(line)
diff_file.close()
i=i+1
Since you always set i to 0 before accessing cr[i], it will always be cr[0].
Move the i=0 to before the start of the loop over which you want the counter to run.
I think you want something like this:
# Suggested version: the index i now comes from enumerate(actual_files) and
# the diff output file is written inside a `with` block.
# NOTE(review): the indentation of this listing was lost; the nesting is
# assumed to follow the order of the statements.
expected_files=os.listdir('expected/path')
actual_files = os.listdir('actual/path')
# NOTE(review): diff_files and j are assigned but not referenced again here.
diff_files=os.listdir('diff/path')
cr=['base.txt','final.txt']
j=1
for files in expected_files:
tst=os.path.join('expected/path',files)
with open(tst,'r')as Expected:
#i=0
for i, actualfile in enumerate(actual_files):
actualpath=os.path.join('actual/path',actualfile)
with open(actualpath,'r') as actual:
# NOTE(review): readlines() consumes the handle, so a second call on the
# same Expected handle returns an empty list.
# NOTE(review): fromfile/tofile receive the file objects, not their names.
diff=difflib.unified_diff(Expected.readlines(),
actual.readlines(),
fromfile=Expected,
tofile=actual,)
# cr has two entries, so a third file in actual_files makes cr[i] raise
# IndexError.
diffpath=os.path.join('diff/path',cr[i])
with open(diffpath, 'w') as diff_file:
for line in diff:
diff_file.write(line)
#diff_file.close()
#i=i+1
Some explanation, so the enumerate(actual_files) will give you an index i and the data from the list actualfile this way you don't have to do the incrementing yourself. (Also worth noting that this will break for more than 2 files in your directory!) Also, you can use with open() as foo: syntax for writes as shown.

Python: performance issues with islice

With the following code, I'm seeing longer and longer execution times as I increase the starting row in islice. For example, a start_row of 4 will execute in 1s but a start_row of 500004 will take 11s. Why does this happen and is there a faster way to do this? I want to be able to iterate over several ranges of rows in a large CSV file (several GB) and make some calculations.
import csv
import itertools
from collections import deque
import time
# Time how long it takes to read rows start_row..stop_row-1 of test.csv and
# queue the product of columns 4 and 10 of each row.
my_queue = deque()
start_row = 500004
stop_row = start_row + 50000
# On Python 3 csv.reader needs a text-mode file opened with newline=''
# (a file opened with 'rb' yields bytes, which csv.reader rejects).
with open('test.csv', newline='') as fin:
    #load into csv's reader
    csv_f = csv.reader(fin)
    #start logging time for performance
    start = time.time()
    for row in itertools.islice(csv_f, start_row, stop_row):
        my_queue.append(float(row[4])*float(row[10]))
    #stop logging time
    end = time.time()
    #display performance
    print("Initial queue populating time: %.2f" % (end-start))
For example, a start_row of 4 will execute in 1s but a start_row of
500004 will take 11s
That is islice being intelligent. Or lazy, depending on which term you prefer.
Thing is, files are "just" strings of bytes on your hard drive. They don't have any internal organization. \n is just another set of bytes in that long, long string. There is no way to access any particular line without looking at all of the information before it (unless your lines are of the exact same length, in which case you can use file.seek).
Line 4? Finding line 4 is fast, your computer just needs to find 3 \n. Line 500004? Your computer has to read through the file until it finds 500003 \n. No way around it, and if someone tells you otherwise, they either have some other sort of quantum computer or their computer is reading through the file just like every other computer in the world, just behind their back.
As for what you can do about it: Try to be smart when trying to grab lines to iterate over. Smart, and lazy. Arrange your requests so you're only iterating through the file once, and close the file as soon as you've pulled the data you need. (islice does all of this, by the way.)
In python
# Visit several line ranges in a single pass over the file.
# lines_I_want holds (start, stop) pairs, sorted and non-overlapping;
# `stop` is exclusive. start1, stop1, ... and filename are placeholders.
lines_I_want = [(start1, stop1), (start2, stop2),...]
with open(filename) as f:
    for i,j in enumerate(f):
        # Drop every range that ends at or before the current line, so the
        # current line is then tested against the next range as well.
        while lines_I_want and i >= lines_I_want[0][1]:
            lines_I_want.pop(0)
        if not lines_I_want: #list is empty
            break
        if i >= lines_I_want[0][0]:
            pass #j is a line I want. Do something
And if you have any control over making that file, make every line the same length so you can seek. Or use a database.
The problem with using islice() for what you're doing is that iterates through all the lines before the first one you want before returning anything. Obviously the larger the starting row, the longer this will take. Another is that you're using a csv.reader to read these lines, which incurs likely unnecessary overhead since one line of the csv file is often one row of it. The only time that's not true is when the csv file has string fields in it that contain embedded newline characters — which in my experience is uncommon.
If this is a valid assumption for your data, it would likely be much faster to first index the file and build a table of (filename, offset, number-of-rows) tuples indicating the approximately equally-sized logical chunks of lines/rows in the file. With that, you can process them relatively quickly by first seeking to the starting offset and then reading the specified number of csv rows from that point on.
Another advantage to this approach is that it would allow you to process the chunks in parallel, which I suspect is the real problem you're trying to solve, based on a previous question of yours. So, even though you haven't mentioned multiprocessing here, the following has been written to be compatible with doing that, if that's the case.
import csv
from itertools import islice
import os
import sys
def open_binary_mode(filename, mode='r'):
    """ Open a file the proper way for the running Python version.

    On Python 2 a 'b' is appended to `mode`; on any other version the file
    is opened with the given `mode` and newline=''.
    """
    if sys.version_info[0] == 2:
        return open(filename, mode=mode+'b')
    return open(filename, mode=mode, newline='')
def split(infilename, num_chunks):
    """Divide a file into `num_chunks` runs of whole lines of similar size.

    Parameters:
        infilename: path of the file to index.
        num_chunks: number of chunks to describe (must be non-zero).

    Returns a list of `num_chunks` tuples (infilename, offset, num_rows),
    where `offset` is the byte position of the chunk's first line and
    `num_rows` the number of lines in it. Every chunk except the last stops
    at the first line boundary at or past file_size // num_chunks bytes; the
    last chunk takes all remaining lines, so no line is left out. Chunks can
    have 0 rows when the file has fewer lines than chunks.
    """
    infile_size = os.path.getsize(infilename)
    chunk_size = infile_size // num_chunks
    offset = 0
    chunks = []
    # Read raw bytes so that len(line) is a byte count and the offsets are
    # real file positions even when the data is not pure ASCII.
    with open(infilename, 'rb') as infile:
        for chunk_index in range(num_chunks):
            is_last = chunk_index == num_chunks - 1
            num_rows = 0
            bytes_read = 0
            for line in infile:
                bytes_read += len(line)
                num_rows += 1
                if not is_last and bytes_read >= chunk_size:
                    break
            chunks.append((infilename, offset, num_rows))
            offset += bytes_read
    return chunks
# Demo: index sample_simple.csv into 4 chunks, then read each chunk back by
# seeking to its offset and parsing only that chunk's number of csv rows.
chunks = split('sample_simple.csv', num_chunks=4)
for chunk in chunks:
    filename, offset, rows = chunk
    message = 'processing: {} rows starting at offset {}'.format(rows, offset)
    print(message)
    with open_binary_mode(filename, 'r') as fin:
        fin.seek(offset)
        reader = csv.reader(fin)
        for row in islice(reader, rows):
            print(row)

Upper memory limit?

Is there a limit to memory for python? I've been using a python script to calculate the average values from a file which is a minimum of 150mb big.
Depending on the size of the file I sometimes encounter a MemoryError.
Can more memory be assigned to the python so I don't encounter the error?
EDIT: Code now below
NOTE: The file sizes can vary greatly (up to 20GB) the minimum size of the a file is 150mb
# Question code (Python 2): for each input file, load every line, split it on
# tabs, then print the average of every column and write some of the averages
# to average_generations.txt.
# NOTE(review): the indentation of this listing was lost, so the exact loop
# nesting cannot be confirmed from here.
file_A1_B1 = open("A1_B1_100000.txt", "r")
file_A2_B2 = open("A2_B2_100000.txt", "r")
file_A1_B2 = open("A1_B2_100000.txt", "r")
file_A2_B1 = open("A2_B1_100000.txt", "r")
file_write = open ("average_generations.txt", "w")
mutation_average = open("mutation_average", "w")
# NOTE(review): file_A2_B2 is listed twice and file_A1_B1 is not listed.
files = [file_A2_B2,file_A2_B2,file_A1_B2,file_A2_B1]
for u in files:
# readlines() loads the whole file into memory at once.
line = u.readlines()
# Second in-memory copy: one list of field strings per line.
list_of_lines = []
for i in line:
values = i.split('\t')
list_of_lines.append(values)
# Count the rows (same value as len(list_of_lines)).
count = 0
for j in list_of_lines:
count +=1
# list.remove raises ValueError for a row that has no element equal to '\n'.
for k in range(0,count):
list_of_lines[k].remove('\n')
length = len(list_of_lines[0])
print_counter = 4
# Average each column o over all rows p.
for o in range(0,length):
total = 0
for p in range(0,count):
number = float(list_of_lines[p][o])
total = total + number
average = total/count
print average
if print_counter == 4:
file_write.write(str(average)+'\n')
print_counter = 0
print_counter +=1
file_write.write('\n')
(This is my third answer because I misunderstood what your code was doing in my original, and then made a small but crucial mistake in my second—hopefully three's a charm.)
Edits: Since this seems to be a popular answer, I've made a few modifications to improve its implementation over the years—most not too major. This is so if folks use it as template, it will provide an even better basis.
As others have pointed out, your MemoryError problem is most likely because you're attempting to read the entire contents of huge files into memory and then, on top of that, effectively doubling the amount of memory needed by creating a list of lists of the string values from each line.
Python's memory limits are determined by how much physical ram and virtual memory disk space your computer and operating system have available. Even if you don't use it all up and your program "works", using it may be impractical because it takes too long.
Anyway, the most obvious way to avoid that is to process each file a single line at a time, which means you have to do the processing incrementally.
To accomplish this, a list of running totals for each of the fields is kept. When that is finished, the average value of each field can be calculated by dividing the corresponding total value by the count of total lines read. Once that is done, these averages can be printed out and some written to one of the output files. I've also made a conscious effort to use very descriptive variable names to try to make it understandable.
try:
    from itertools import izip_longest
except ImportError: # Python 3
    from itertools import zip_longest as izip_longest

# For each input file, compute the average of every tab-separated column in
# a single pass (one running total per column), print all averages, and write
# every GROUP_SIZE-th one (columns 0, 4, 8, ...) to average_generations.txt.
GROUP_SIZE = 4
input_file_names = ["A1_B1_100000.txt", "A2_B2_100000.txt", "A1_B2_100000.txt",
                    "A2_B1_100000.txt"]
# Both output files are closed even if an input file is missing or malformed.
# mutation_average is left in (it is created/truncated), but nothing is written.
with open("average_generations.txt", 'w') as file_write, \
        open("mutation_average", 'w') as mutation_average:
    for file_name in input_file_names:
        with open(file_name, 'r') as input_file:
            print('processing file: {}'.format(file_name))
            totals = []
            count = 0  # stays 0 (and totals stays empty) for an empty file
            # Strip the line terminator and any trailing tab first, so a line
            # ending in '\t\n' does not yield a final '\n' field for float().
            for count, fields in enumerate(
                    (line.rstrip('\t\r\n').split('\t') for line in input_file), 1):
                # izip_longest lets the totals list grow to the widest row.
                totals = [sum(values) for values in
                          izip_longest(totals, map(float, fields), fillvalue=0)]
        averages = [total/count for total in totals]
        for print_counter, average in enumerate(averages):
            print(' {:9.4f}'.format(average))
            if print_counter % GROUP_SIZE == 0:
                file_write.write(str(average)+'\n')
        file_write.write('\n')
You're reading the entire file into memory (line = u.readlines()) which will fail of course if the file is too large (and you say that some are up to 20 GB), so that's your problem right there.
Better iterate over each line:
# Sketch: iterate over the open file object `u` one line at a time;
# do_something_with is a placeholder for the per-line processing.
for current_line in u:
do_something_with(current_line)
is the recommended approach.
Later in your script, you're doing some very strange things like first counting all the items in a list, then constructing a for loop over the range of that count. Why not iterate over the list directly? What is the purpose of your script? I have the impression that this could be done much easier.
This is one of the advantages of high-level languages like Python (as opposed to C where you do have to do these housekeeping tasks yourself): Allow Python to handle iteration for you, and only collect in memory what you actually need to have in memory at any given time.
Also, as it seems that you're processing TSV files (tabulator-separated values), you should take a look at the csv module which will handle all the splitting, removing of \ns etc. for you.
Python can use all memory available to its environment. My simple "memory test" crashes on ActiveState Python 2.6 after using about
1959167 [MiB]
On jython 2.5 it crashes earlier:
239000 [MiB]
probably I can configure Jython to use more memory (it uses limits from JVM)
Test app:
# Memory test: keep appending zero-padded strings to a list in an endless
# loop and report progress on stderr. The loop has no exit condition.
# NOTE(review): the indentation of this listing was lost, so the nesting of
# the trailing `if` statements cannot be confirmed from here.
import sys
sl = []
i = 0
# some magic 1024 - overhead of string object
fill_size = 1024
# The padding width is chosen from the interpreter version string.
if sys.version.startswith('2.7'):
fill_size = 1003
if sys.version.startswith('3'):
fill_size = 497
print(fill_size)
MiB = 0
while True:
s = str(i).zfill(fill_size)
sl.append(s)
# On the first iteration, report the size of one string object.
if i == 0:
try:
sys.stderr.write('size of one string %d\n' % (sys.getsizeof(s)))
except AttributeError:
pass
i += 1
# MiB is incremented once per 1024 appended strings.
if i % 1024 == 0:
MiB += 1
if MiB % 25 == 0:
sys.stderr.write('%d [MiB]\n' % (MiB))
In your app you read whole file at once. For such big files you should read the line by line.
No, there's no Python-specific limit on the memory usage of a Python application. I regularly work with Python applications that may use several gigabytes of memory. Most likely, your script actually uses more memory than available on the machine you're running on.
In that case, the solution is to rewrite the script to be more memory efficient, or to add more physical memory if the script is already optimized to minimize memory usage.
Edit:
Your script reads the entire contents of your files into memory at once (line = u.readlines()). Since you're processing files up to 20 GB in size, you're going to get memory errors with that approach unless you have huge amounts of memory in your machine.
A better approach would be to read the files one line at a time:
# Sketch: process every open file in `files` line by line instead of calling
# readlines() on it.
for u in files:
for line in u: # This will iterate over each line in the file
# Read values from the line, do necessary calculations
Not only are you reading the whole of each file into memory, but also you laboriously replicate the information in a table called list_of_lines.
You have a secondary problem: your choices of variable names severely obfuscate what you are doing.
Here is your script rewritten with the readlines() caper removed and with meaningful names:
# The question's script (Python 2) with readlines() removed and descriptive
# names: each file is still collected into `table` before the column averages
# are computed.
# NOTE(review): the indentation of this listing was lost, so the exact loop
# nesting cannot be confirmed from here.
file_A1_B1 = open("A1_B1_100000.txt", "r")
file_A2_B2 = open("A2_B2_100000.txt", "r")
file_A1_B2 = open("A1_B2_100000.txt", "r")
file_A2_B1 = open("A2_B1_100000.txt", "r")
file_write = open ("average_generations.txt", "w")
mutation_average = open("mutation_average", "w") # not used
# NOTE(review): file_A2_B2 is listed twice and file_A1_B1 is not listed.
files = [file_A2_B2,file_A2_B2,file_A1_B2,file_A2_B1]
for afile in files:
table = []
for aline in afile:
values = aline.split('\t')
# list.remove raises ValueError for a row that has no element equal to '\n'.
values.remove('\n') # why?
table.append(values)
row_count = len(table)
row0length = len(table[0])
print_counter = 4
# Average each column over all rows of the table.
for column_index in range(row0length):
column_total = 0
for row_index in range(row_count):
number = float(table[row_index][column_index])
column_total = column_total + number
column_average = column_total/row_count
print column_average
if print_counter == 4:
file_write.write(str(column_average)+'\n')
print_counter = 0
print_counter +=1
file_write.write('\n')
It rapidly becomes apparent that (1) you are calculating column averages (2) the obfuscation led some others to think you were calculating row averages.
As you are calculating column averages, no output is required until the end of each file, and the amount of extra memory actually required is proportional to the number of columns.
Here is a revised version of the outer loop code:
# Revised outer loop: keep one running total per column while streaming each
# file, so the extra memory needed is proportional to the number of columns.
# `files` (open input files) and `file_write` (open output file) are defined
# by the surrounding script.
for afile in files:
    row_count = 0
    column_totals = []  # stays empty for an empty file
    for row_count, aline in enumerate(afile, start=1):
        # Strip the line terminator and any trailing tab instead of removing
        # a '\n' field, which fails on lines without a trailing tab.
        values = aline.rstrip('\t\r\n').split('\t')
        # A list, not map(): the values are indexed and measured below.
        fvalues = [float(value) for value in values]
        if row_count == 1:
            column_totals = fvalues
        else:
            if len(fvalues) != len(column_totals):
                raise ValueError('row %d has %d columns, expected %d'
                                 % (row_count, len(fvalues), len(column_totals)))
            for column_index, value in enumerate(fvalues):
                column_totals[column_index] += value
    for column_index, column_total in enumerate(column_totals):
        column_average = column_total / row_count
        print(column_average)
        # Only every fourth average (columns 0, 4, 8, ...) is written out.
        if column_index % 4 == 0:
            file_write.write(str(column_average)+'\n')

Categories