I have a large .csv file that is well over 300 GB. I would like to chunk it into smaller files of 100,000,000 rows each (each row is approximately 55-60 bytes).
I wrote the following code:
import pandas as pd

df = pd.read_csv('/path/to/really/big.csv', header=None, chunksize=100000000)
count = 1
for chunk in df:
    name = '/output/to/this/directory/file_%s.csv' % count
    chunk.to_csv(name, header=None, index=None)
    print(count)
    count += 1
This code works fine, and I have plenty of disk space to store the roughly 5.5-6 GB at a time, but it's slow.
Is there a better way?
EDIT
I have written the following iterative solution:
import csv

with open('/path/to/really/big.csv', 'r') as csvfile:
    read_rows = csv.reader(csvfile)
    file_count = 1
    row_count = 1
    f = open('/output/to/this/directory/file_%s.csv' % file_count, 'w')
    for row in read_rows:
        # note: assumes no quoted fields containing commas; use csv.writer for full fidelity
        f.write(','.join(row) + '\n')
        row_count += 1
        if row_count % 100000000 == 0:
            f.close()
            file_count += 1
            f = open('/output/to/this/directory/file_%s.csv' % file_count, 'w')
    f.close()
EDIT 2
I would like to call attention to Vor's comment about using the Unix/Linux split command; it is the fastest solution I have found.
There is an existing tool for this in Unix/Linux:
split -l 100000 -d source destination
will split source into chunks of 100,000 lines each and add a two-digit numeric suffix to the destination prefix for each chunk.
You don't really need to read all that data into a pandas DataFrame just to split the file - you don't even need to read all the data into memory at all. You could seek to the approximate offset where you want to split, then scan forward until you find a line break, and then loop, reading much smaller chunks from the source file into a destination file between your start and end offsets. (This approach assumes your CSV doesn't have any column values with embedded newlines.)
SMALL_CHUNK = 100000

def write_chunk(source_file, start, end, dest_name):
    pos = start
    source_file.seek(pos)
    with open(dest_name, 'w') as dest_file:
        for chunk_start in range(start, end, SMALL_CHUNK):
            chunk_end = min(chunk_start + SMALL_CHUNK, end)
            dest_file.write(source_file.read(chunk_end - chunk_start))
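For what it's worth, here is a sketch of how the surrounding loop might look, under the assumption that the file is plain ASCII/UTF-8 and is handled in binary mode so that offsets are simple byte counts (the function name, TARGET_CHUNK and the destination template are mine, not from the answer; the inner copy loop mirrors write_chunk above):

import os

SMALL_CHUNK = 100000          # bytes copied per read/write, as above
TARGET_CHUNK = 6_000_000_000  # rough size of each output file in bytes

def split_on_newlines(source_name, dest_template='/output/to/this/directory/file_{}.csv'):
    total_size = os.path.getsize(source_name)
    with open(source_name, 'rb') as source_file:
        start = 0
        part = 0
        while start < total_size:
            # Jump to the approximate split point, then scan forward to the next newline.
            approx_end = min(start + TARGET_CHUNK, total_size)
            if approx_end >= total_size:
                end = total_size
            else:
                source_file.seek(approx_end)
                source_file.readline()          # finish the partial line
                end = source_file.tell()
            # Copy [start, end) into the destination file in small pieces.
            source_file.seek(start)
            with open(dest_template.format(part), 'wb') as dest_file:
                remaining = end - start
                while remaining > 0:
                    piece = source_file.read(min(SMALL_CHUNK, remaining))
                    dest_file.write(piece)
                    remaining -= len(piece)
            start = end
            part += 1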
Actually, an intermediate solution could be to use the csv module - that would still parse all of the lines in the file, which isn't strictly necessary, but would avoid reading huge arrays into memory for each chunk.
Related
I am trying to split a very large text file (approximately 150 gigabytes) into several smaller text files (approximately 10 gigabytes each).
My general process will be:
# iterate over file one line at a time
# accumulate batch as string
--> # given a certain count that correlates to the size of my current accumulated batch and when that size is met: (this is where I am unsure)
# write to file
# accumulate size count
I have a rough metric to calculate when to batch (when the desired batch size is reached), but am not so clear how I should calculate how often to write to disk for a given batch. For example, if my batch size is 10 gigabytes, I assume I will need to iteratively write rather than hold the entire 10 gigabyte batch in memory. I obviously do not want to write more than I have to, as this could be quite expensive.
Do y'all have any rough calculations or tricks that you like to use to figure out when to write to disk for a task such as this, e.g. size vs. memory or something?
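Roughly what I have in mind, as a minimal sketch (all names and thresholds are placeholders); the file is opened in binary mode so that len(line) counts bytes:

batch_size_limit = 10_000_000_000   # ~10 GB per output file
write_threshold = 100 * 2**20       # flush accumulated lines every ~100 MB

file_index = 0
batch_bytes = 0                     # bytes already written to the current output file
buffer_lines = []                   # lines accumulated since the last flush
buffer_bytes = 0

outfile = open(f'batch_{file_index}.txt', 'wb')
with open('huge_input.txt', 'rb') as infile:
    for line in infile:
        buffer_lines.append(line)
        buffer_bytes += len(line)
        if buffer_bytes >= write_threshold:      # time to write to disk
            outfile.writelines(buffer_lines)
            batch_bytes += buffer_bytes
            buffer_lines, buffer_bytes = [], 0
        if batch_bytes >= batch_size_limit:      # current batch is big enough
            outfile.close()
            file_index += 1
            batch_bytes = 0
            outfile = open(f'batch_{file_index}.txt', 'wb')
    outfile.writelines(buffer_lines)             # flush whatever is left
outfile.close()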
Assuming your large file is simple unstructured text (i.e. this is no good for structured text like JSON), here's an alternative to reading every single line: read large binary bites of the input file until you reach your chunksize, then read a couple of lines, close the current output file and move on to the next.
I compared this with the line-by-line approach, using @tdelaney's code adapted to the same chunksize as mine - that code took 250 s to split a 12 GiB input file into 6 x 2 GiB chunks, whereas this took ~50 s, so maybe five times faster. It looks like it's I/O bound on my SSD, running >200 MiB/s read and write, where the line-by-line version was running at 40-50 MiB/s read and write.
I turned buffering off because there's not a lot of point to it here. The bite size and the buffering setting may be tunable to improve performance; I haven't tried any other settings, as for me it seems to be I/O bound anyway.
import time

outfile_template = "outfile-{}.txt"
infile_name = "large.text"

chunksize = 2_000_000_000
MEB = 2**20           # mebibyte
bitesize = 4_000_000  # the size of the reads (and writes) working up to chunksize

count = 0

starttime = time.perf_counter()

infile = open(infile_name, "rb", buffering=0)
outfile = open(outfile_template.format(count), "wb", buffering=0)
while True:
    byteswritten = 0
    while byteswritten < chunksize:
        bite = infile.read(bitesize)
        # check for EOF
        if not bite:
            break
        outfile.write(bite)
        byteswritten += len(bite)
    # check for EOF
    if not bite:
        break
    for i in range(2):
        l = infile.readline()
        # check for EOF
        if not l:
            break
        outfile.write(l)
    # check for EOF
    if not l:
        break
    outfile.close()
    count += 1
    print(count)
    outfile = open(outfile_template.format(count), "wb", buffering=0)

outfile.close()
infile.close()

endtime = time.perf_counter()
elapsed = endtime - starttime
print(f"Elapsed= {elapsed}")
NOTE: I haven't exhaustively tested that this doesn't lose data; although I have no evidence that it loses anything, you should validate that yourself.
It might be useful to add some robustness by checking, at the end of each chunk, how much data is left to read, so you don't end up with the last output file being 0-length (or shorter than bitesize).
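One simple form of that check, as a sketch reusing the infile / infile_name names from the code above: compare the file size against the current read position before opening the next output file.

import os

def bytes_left(open_file, file_name):
    # how much of the input file remains unread from this handle
    return os.path.getsize(file_name) - open_file.tell()

# e.g. just before reopening `outfile` for the next chunk:
#     if bytes_left(infile, infile_name) == 0:
#         break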
I used a slightly modified version of this for parsing a 250 GB JSON file. I choose how many smaller files I need (number_of_slices), find the positions where to slice the file (always looking for a line end), and finally slice the file with file.seek and file.read(chunk).
import os
import mmap

FULL_PATH_TO_FILE = 'full_path_to_a_big_file'
OUTPUT_PATH = 'full_path_to_a_output_dir'  # where sliced files will be generated


def next_newline_finder(mmapf):
    def nl_find(mmapf):
        while 1:
            current = hex(mmapf.read_byte())
            if hex(ord('\n')) == current:  # or whatever line-end symbol
                return mmapf.tell()
    return nl_find(mmapf)


# find positions where to slice a file
file_info = os.stat(FULL_PATH_TO_FILE)
file_size = file_info.st_size
positions_for_file_slice = [0]
number_of_slices = 15  # say you want to slice the big file into 15 smaller files
size_per_slice = file_size // number_of_slices

with open(FULL_PATH_TO_FILE, "r+b") as f:
    mmapf = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    slice_counter = 1
    while slice_counter < number_of_slices:
        pos = size_per_slice * slice_counter
        mmapf.seek(pos)
        newline_pos = next_newline_finder(mmapf)
        positions_for_file_slice.append(newline_pos)
        slice_counter += 1

# create ranges for found positions (from, to)
positions_for_file_slice = [
    (pos, positions_for_file_slice[i + 1]) if i < (len(positions_for_file_slice) - 1)
    else (positions_for_file_slice[i], file_size)
    for i, pos in enumerate(positions_for_file_slice)
]

# do actual slice of a file
with open(FULL_PATH_TO_FILE, "rb") as f:
    for i, position_pair in enumerate(positions_for_file_slice):
        read_from, read_to = position_pair
        f.seek(read_from)
        chunk = f.read(read_to - read_from)
        with open(os.path.join(OUTPUT_PATH, f'dummyfile{i}.json'), 'wb') as chunk_file:
            chunk_file.write(chunk)
Here is an example of line-by-line writes. The file is opened in binary mode to avoid the line-decode step, which takes a modest amount of time and can skew character counts. For instance, UTF-8 encoding may use multiple bytes on disk for a single Python character.
4 MiB is a guess at the buffer size. The idea is to get the operating system to read more of the file at once, reducing seek times. Whether this works, and what the best number to use is, is debatable - and it will be different for different operating systems. I found 4 MiB made a difference... but that was years ago and things change.
outfile_template = "outfile-{}.txt"
infile_name = "infile.txt"

chunksize = 10_000_000_000
MEB = 2**20   # mebibyte

count = 0
byteswritten = 0
infile = open(infile_name, "rb", buffering=4*MEB)
outfile = open(outfile_template.format(count), "wb", buffering=4*MEB)

try:
    for line in infile:
        if byteswritten > chunksize:
            outfile.close()
            byteswritten = 0
            count += 1
            outfile = open(outfile_template.format(count), "wb", buffering=4*MEB)
        outfile.write(line)
        byteswritten += len(line)
finally:
    infile.close()
    outfile.close()
FYI I am new to Python and this website!
I have a csv file:
Product Number,Account Number,Transactions,Year Number,Left Output,Mid Output
43854835,12345,23123,12,12,45
4353454,23456,123213213,4,23,56
7657657,34567,321321,5,34,67
21321312,45678,321321,8,45,78
21312313,56789,2131233,3,56,89
If I want to refer to column 2 as the one where I need to apply LEFT and MID in Python, what is the best approach without libraries? I also want to append the results as the last columns of the data, as seen in the image.
This takes in a .csv file, reads the lines into a list, appends new data using the LEFT and MID functions and saves it to a new file (newFile.csv). This works according to the data in the imgur link.
Note: the script is hardly optimised; it was tested on ~2 million lines and took a couple of minutes and a lot of RAM (2-3 GB), so be careful about running it (back up the original csv file, save your work, close programs, etc.).
I could modify this to batch-process lines so memory is freed, and maybe add some sort of cache, but since I'm assuming it will be used sparingly, it should probably be fine (a line-by-line variant is sketched after the code below).
filename = "myFile.csv"
# don't want to overwrite original
new_filename = "newFile.csv"
def LEFT(s, length):
# example: LEFT("apple",3) returns "app".
return str(s[:length])
def MID(s, start, length):
# example: MID("apple",2,3) returns "ppl"
return str(s[start - 1: start - 1 + length])
# read file contents into list
with open(filename, 'r') as file:
# store file data in a list
file_data = file.read().splitlines()
# loop and append new data to list
for i, line in enumerate(file_data):
# ignore header
if (i == 0): continue;
# parse 2nd column
second_column = line.split(",")[1]
# append 5th and 6th column
file_data[i] += "," + LEFT(second_column, 2) \
+ "," + MID(second_column, 4, 2)
# write modified list to new file
with open(new_filename, 'w') as file:
for line in file_data:
file.write(line + '\n')
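If memory does become an issue, a line-by-line variant along these lines should produce the same output without holding the whole file in a list (just a sketch, reusing the LEFT/MID helpers and the filename / new_filename variables from above):

with open(filename, 'r') as infile, open(new_filename, 'w') as outfile:
    for i, line in enumerate(infile):
        line = line.rstrip('\n')
        if i == 0:
            outfile.write(line + '\n')   # copy the header row unchanged
            continue
        second_column = line.split(",")[1]
        outfile.write(line + "," + LEFT(second_column, 2)
                      + "," + MID(second_column, 4, 2) + '\n')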
I have a dictionary with 400,000 items in it, whose keys are DNA names and values are DNA sequences.
I want to divide the dictionary into 40 text files with 10,000 items in each of the files.
Here is my code:
record_dict    # my DNA dictionary
keys_in_dict   # the list of the keys

for keys in keys_in_dict:
    outhandle = open("D:\\Research\\Transcriptome_sequences\\input{0}.fasta".format(?????), "w")
What should I put in place of (?????)? How do I finish this loop?
UPDATE:
Hey fellows,
Thank you for your help. Now I can make multiple files from a dictionary. However, when I tried to make multiple files directly from the original file instead of making a dictionary first, I had problems. The code only generates one file with the first item in it. What did I do wrong? Here is my code:
from Bio import SeqIO

handle = open("D:/Research/Transcriptome_sequences/differentially_expressed_genes.fasta", "rU")
filesize = 100  # number of entries per file
filenum = 0
itemcount = 0

for record in SeqIO.parse(handle, "fasta"):
    if not itemcount % filesize:
        outhandle = open("D:/Research/Transcriptome_sequences/input{0}.fasta".format(filenum), "w")
        SeqIO.write(record, outhandle, "fasta")
        filenum += 1
        itemcount += 1

outhandle.close()
# Python 2: dict.items() returns a list here, and xrange is available
n = 10000
sections = (record_dict.items()[i:i+n] for i in xrange(0, len(record_dict), n))
for ind, sec in enumerate(sections):
    with open("D:/Research/Transcriptome_sequences/input{0}.fasta".format(ind), "w") as f1:
        for k, v in sec:
            f1.write("{} {}\n".format(k, v))
It will not be the fastest solution, but I think the most straightforward way is to keep track of lines and open a new file every 10,000 iterations through the loop.
I assume you are writing out fasta or something.
Otherwise, you could slice the list [:10000] beforehand and generate a chunk of output to write all at once with one command (which would be much faster). Even as it is, you might want to build up the string by concatenating through the loop and then writing that one monstrous string out with a single .write command for each file.
itemcount = 0
filesize = 10000
filenum = 0
filehandle = ""

for keys in keys_in_dict:
    # check if it is time to open a new file,
    # whenever itemcount/filesize has no remainder
    if not itemcount % filesize:
        if filehandle:
            filehandle.close()
        filenum += 1
        PathToFile = "D:/Research/Transcriptome_sequences/input{0}.fasta".format(filenum)
        filehandle = open(PathToFile, 'w')
    filehandle.write(">{0}\n{1}\n".format(keys, record_dict[keys]))
    itemcount += 1

filehandle.close()
EDIT: Here is a more efficient way to do it (time-wise, not memory-wise), only writing once per file (40x total) instead of with each line (400,000 times). As always, check your output, especially making sure that the first and last sequences are included in the output and the last file is written properly.
filesize = 10  # number of entries per file
filenum = 0
filehandle = ""
OutString = ""
print record_dict

for itemcount, keys in enumerate(keys_in_dict):
    # check if it is time to open a new file,
    # whenever itemcount/filesize has no remainder
    OutString += ">{0}\n{1}\n".format(keys, record_dict[keys])
    if not itemcount % filesize:
        if filehandle:
            filehandle.write(OutString)
            filehandle.close()
            OutString = ""
        filenum += 1
        PathToFile = "D:/Research/Transcriptome_sequences/input{0}.fasta".format(filenum)
        filehandle = open(PathToFile, 'w')

filehandle.write(OutString)
filehandle.close()
Making use of the built-in function itertools.tee could solve this elegantly.
import itertools

for (idx, keys2) in enumerate(itertools.tee(keys_in_dict, 40)):
    with open('filename_prefix_%02d.fasta' % idx, 'w') as fout:
        for key in keys2:
            fout.write(...)
Quoted from the doc for your reference:
itertools.tee(iterable[, n=2]) - Return n independent iterators from a single iterable.

Once tee() has made a split, the original iterable should not be used anywhere else; otherwise, the iterable could get advanced without the tee objects being informed.

This itertool may require significant auxiliary storage (depending on how much temporary data needs to be stored). In general, if one iterator uses most or all of the data before another iterator starts, it is faster to use list() instead of tee().
I routinely use PowerShell to split larger text or csv files into smaller files for quicker processing. However, I have a few files that come over in an unusual format. These are basically print files dumped to a text file. Each record starts with a single line that begins with a 1, with nothing else on the line.
What I need to be able to do is split a file based on the number of statements. So, basically, if I want to split the file into chunks of 3000 statements, I would go down until I see the 3001st occurrence of a 1 in position 1 and copy everything before that to the new file. I can run this from Windows, Linux or OS X, so pretty much anything is open for the split.
Any ideas would be greatly appreciated.
Maybe try recognizing it by the fact that there is a '1' plus a new line?
with open(input_file, 'r') as f:
    my_string = f.read()
    my_list = my_string.split('\n1\n')
This separates each record into a list element, assuming the file has the following format:
1
....
....
1
....
....
....
You can then output each element in the list to a separate file.
for x, record in enumerate(my_list):
    with open(str(x) + '.txt', 'w') as f_out:
        f_out.write(record)
To avoid loading the whole file into memory, you could define a function that generates records incrementally and then use itertools' grouper recipe to write every 3000 records to a new file:
#!/usr/bin/env python3
from itertools import zip_longest

with open('input.txt') as input_file:
    files = zip_longest(*[generate_records(input_file)]*3000, fillvalue=())
    for n, records in enumerate(files):
        with open('output{n}.txt'.format(n=n), 'w') as output_file:
            output_file.writelines(''.join(lines)
                                   for r in records for lines in r)
where generate_records() yields one record at a time where a record is also an iterator over lines in the input file:
from itertools import chain

def generate_records(input_file, start='1\n', eof=[]):
    def record(yield_start=True):
        if yield_start:
            yield start
        for line in input_file:
            if line == start:  # start new record
                break
            yield line
        else:  # EOF
            eof.append(True)

    # the first record may include lines before the first 1\n
    yield chain(record(yield_start=False),
                record())
    while not eof:
        yield record()
generate_records() is a generator that yields generators, like itertools.groupby() does.
For performance reasons, you could read/write chunks of multiple lines at once.
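For example, the reads could be batched with the readlines() size hint instead of pulling one line per iteration - a rough sketch, independent of the generate_records() code above:

def batched_lines(input_file, hint=1_000_000):
    # yield lists of lines, each list totalling roughly `hint` bytes/characters
    while True:
        batch = input_file.readlines(hint)
        if not batch:
            break
        yield batch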
I have a 7 GB csv file which I'd like to split into smaller chunks, so it is readable and faster to analyse in Python on a notebook. I would like to grab a small set from it, maybe 250 MB; how can I do this?
You don't need Python to split a csv file. Using your shell:
$ split -l 100 data.csv
Would split data.csv into chunks of 100 lines.
I had to do a similar task, and used the pandas package:
import pandas as pd

for i, chunk in enumerate(pd.read_csv('bigfile.csv', chunksize=500000)):
    chunk.to_csv('chunk{}.csv'.format(i), index=False)
Here is a little Python script I used to split a file data.csv into several CSV part files. The number of part files can be controlled with chunk_size (the number of lines per part file).
The header line (column names) of the original file is copied into every part CSV file.
It works for big files because it reads one line at a time instead of loading the complete file into memory at once.
#!/usr/bin/env python3

def main():
    chunk_size = 9998  # lines

    def write_chunk(part, lines):
        with open('data_part_' + str(part) + '.csv', 'w') as f_out:
            f_out.write(header)
            f_out.writelines(lines)

    with open('data.csv', 'r') as f:
        count = 0
        header = f.readline()
        lines = []
        for line in f:
            count += 1
            lines.append(line)
            if count % chunk_size == 0:
                write_chunk(count // chunk_size, lines)
                lines = []
        # write remainder
        if len(lines) > 0:
            write_chunk((count // chunk_size) + 1, lines)

if __name__ == '__main__':
    main()
This graph shows the runtime difference of the different approaches outlined by other posters (on an 8 core machine when splitting a 2.9 GB file with 11.8 million rows of data into ~290 files).
The shell approach is from Thomas Orozco, the Python approach is from Roberto, the Pandas approach is from Quentin Febvre, and here's the Dask snippet:
import dask.dataframe as dd

# dtypes is a dict mapping column names to dtypes, defined elsewhere
ddf = dd.read_csv("../nyc-parking-tickets/Parking_Violations_Issued_-_Fiscal_Year_2015.csv", blocksize=10000000, dtype=dtypes)
ddf.to_csv("../tmp/split_csv_dask")
I'd recommend Dask for splitting files, even though it's not the fastest, because it's the most flexible solution (you can write out different file formats, perform processing operations before writing, easily modify compression formats, etc.). The Pandas approach is almost as flexible, but cannot perform processing on the entire dataset (like sorting the entire dataset before writing).
Bash / native Python filesystem operations are clearly quicker, but that's not what I'm typically looking for when I have a large CSV. I'm typically interested in splitting large CSVs into smaller Parquet files, for performant, production data analyses. I don't usually care if the actual splitting takes a couple of minutes more. I'm more interested in splitting accurately.
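For that Parquet use case, the same Dask dataframe can be written out directly; for example (assuming the ddf from the snippet above and a Parquet engine such as pyarrow installed):

ddf.to_parquet("../tmp/split_parquet_dask")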
I wrote a blog post that discusses this in more detail. You can probably Google around and find the post.
See the Python docs on file objects (the object returned by open(filename)) - you can choose to read a specified number of bytes, or use readline to work through one line at a time.
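For instance, both options mentioned there look roughly like this (the filename is just a placeholder):

with open('big.csv') as f:
    first_line = f.readline()   # read a single line
    next_block = f.read(65536)  # or read a fixed number of bytes/characters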
Maybe something like this?
#!/usr/local/cpython-3.3/bin/python

import csv

divisor = 10

outfileno = 1
outfile = None

with open('big.csv', 'r') as infile:
    for index, row in enumerate(csv.reader(infile)):
        if index % divisor == 0:
            if outfile is not None:
                outfile.close()
            outfilename = 'big-{}.csv'.format(outfileno)
            outfile = open(outfilename, 'w')
            outfileno += 1
            writer = csv.writer(outfile)
        writer.writerow(row)
I agree with @jonrsharpe - readline should be able to read one line at a time, even for big files.
If you are dealing with big csv files, might I suggest using pandas.read_csv. I often use it for the same purpose and always find it awesome (and fast). It takes a bit of time to get used to the idea of DataFrames, but once you get over that, it speeds up large operations like yours massively.
Hope it helps.
Here is my code, which might help:
import os
import pandas as pd
import uuid


class FileSettings(object):
    def __init__(self, file_name, row_size=100):
        self.file_name = file_name
        self.row_size = row_size


class FileSplitter(object):
    def __init__(self, file_settings):
        self.file_settings = file_settings

        if type(self.file_settings).__name__ != "FileSettings":
            raise Exception("Please pass correct instance ")

        self.df = pd.read_csv(self.file_settings.file_name,
                              chunksize=self.file_settings.row_size)

    def run(self, directory="temp"):
        try:
            os.makedirs(directory)
        except Exception:
            pass

        counter = 0
        while True:
            try:
                file_name = "{}/{}_{}_row_{}_{}.csv".format(
                    directory, self.file_settings.file_name.split(".")[0],
                    counter, self.file_settings.row_size, uuid.uuid4().__str__()
                )
                df = next(self.df).to_csv(file_name)
                counter = counter + 1
            except StopIteration:
                break
            except Exception as e:
                print("Error:", e)
                break

        return True


def main():
    helper = FileSplitter(FileSettings(
        file_name='sample1.csv',
        row_size=10
    ))
    helper.run()


main()
If you want to split by rough byte boundaries, where the newest datapoints are the bottom-most ones and you want to put the newest datapoints in the first file:
from pathlib import Path

TEN_MB = 10000000
FIVE_MB = 5000000


def split_file_into_chunks(path, chunk_size=TEN_MB):
    path = str(path)
    output_prefix = path.rpartition('.')[0]
    output_ext = path.rpartition('.')[-1]

    with open(path, 'rb') as f:
        seek_positions = []
        for x, line in enumerate(f):
            if not x:
                header = line
            seek_positions.append(f.tell())

        part = 0
        last_seek_pos = seek_positions[-1]
        for seek_pos in reversed(seek_positions):
            if last_seek_pos - seek_pos >= chunk_size:
                with open(f'{output_prefix}.arch.{part}.{output_ext}', 'wb') as f_out:
                    f.seek(seek_pos)
                    f_out.write(header)
                    f_out.write(f.read(last_seek_pos - seek_pos))
                last_seek_pos = seek_pos
                part += 1

        with open(f'{output_prefix}.arch.{part}.{output_ext}', 'wb') as f_out:
            f.seek(0)
            f_out.write(f.read(last_seek_pos))

    Path(path).rename(path + '~')
    Path(f'{output_prefix}.arch.0.{output_ext}').rename(path)
    Path(path + '~').unlink()
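Usage might look like this; after the call, the original path holds the newest chunk and the older data is left in the .arch.N files (the filename here is just a placeholder, and FIVE_MB is the smaller threshold defined above):

split_file_into_chunks('measurements.csv', chunk_size=FIVE_MB)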