I am trying to convert a file containing more than 1 billion bytes into integers. Obviously, my machine cannot do this at once so I need to chunk my code. I was able to decode the first 50,000,000 bytes but I am wondering how to read the integers in the file that are between 50,000,001 and 100,000,000, 150,000,000 and 200,000,000 etc. The following is what I have now;the range function is not working with this.
import struct
with open(x, "rb") as f:
this_chunk = range(50000001, 100000000)
data = f.read(this_chunk)
ints1 = struct.unpack("I" * (this_chunk //4) , data)
print(ints1)
You can use f.seek(offset) to set the file pointer to start reading from a certain offset.
In your case, you'd want to skip 50000000 bytes, so you'd call
f.seek(50000000)
At this point, you'd want to read another 50000000 bytes, so you'd call f.read(50000000).
This would be your complete code listing, implementing f.seek and reading the whole file:
# Read the file in 50,000,000-byte chunks, optionally skipping the first one.
with open(x, "rb") as f:
    f.seek(50000000)  # omit if you don't want to skip this chunk
    data = f.read(50000000)
    # f.read() returns an empty bytes object at end of file, which ends the loop.
    while data:
        ...  # do something
        data = f.read(50000000)
Use f.read(50000000) in a loop, as it will read the file in chunks of 50000000 bytes, e.g.:
In []:
from io import StringIO

s = '''hello'''
with StringIO(s) as f:
    # Read two characters at a time; an empty string signals end of stream.
    while True:
        c = f.read(2)
        if not c:
            break
        print(c)
Out[]:
he
ll
o
Related
I need to read chunks of 64KB in loop, and process them, but stop at the end of file minus 16 bytes: the last 16 bytes are a tag metadata.
The file might be super large, so I can't read it all in RAM.
All the solutions I find are a bit clumsy and/or unpythonic.
# Baseline loop: hand every 64 KB block to process_block until end of file.
with open('myfile', 'rb') as f:
    while True:
        block = f.read(65536)
        if not block:
            # An empty read means end of file.
            break
        process_block(block)
If 16 <= len(block) < 65536, it's easy: it's the last block ever. So useful_data = block[:-16] and tag = block[-16:]
If len(block) == 65536, it could mean three things: that the full block is useful data. Or that this 64KB block is in fact the last block, so useful_data = block[:-16] and tag = block[-16:]. Or that this 64KB block is followed by another block of only a few bytes (let's say 3 bytes), so in this case: useful_data = block[:-13] and tag = block[-13:] + last_block[:3].
How to deal with this problem in a nicer way than distinguishing all these cases?
Note:
the solution should work for a file opened with open(...), but also for a io.BytesIO() object, or for a distant SFTP opened file (with pysftp).
I was thinking about getting the file object size, with
f.seek(0,2)
length = f.tell()
f.seek(0)
Then after each
block = f.read(65536)
we can know if we are far from the end with length - f.tell(), but again the full solution does not look very elegant.
you can just read in every iteration min(65536, L-f.tell()-16)
Something like this:
from pathlib import Path
L = Path('myfile').stat().st_size
# The payload is everything before the trailing 16-byte tag. Clamp at 0 so a
# file shorter than the tag never yields a negative read size (f.read(-n)
# would read through to end of file).
payload_end = max(L - 16, 0)
with open('myfile', 'rb') as f:
    while f.tell() < payload_end:
        to_read_length = min(65536, payload_end - f.tell())
        block = f.read(to_read_length)
        if not block:
            # The file is shorter than stat() reported; stop rather than
            # loop forever on empty reads.
            break
        process_block(block)
I did not run this, but I hope you get the gist of it.
The following method relies only on the fact that the f.read() method returns an empty bytes object upon end of stream (EOS). It thus could be adopted for sockets simply by replacing f.read() with s.recv().
def read_all_but_last16(f):
    """Feed everything except the final 16 bytes of *f* to process_block.

    The stream is consumed until ``f.read()`` returns an empty bytes object.
    After every read, all buffered bytes except the last 16 are passed to
    ``process_block`` and the last 16 are kept back. Once the stream ends,
    the bytes still held back are passed to ``verify``.

    ``process_block`` is called with an empty bytes object whenever fewer
    than 17 bytes are buffered, because ``buf[:-16]`` is then empty.
    """
    rand = random.Random()  # just for testing
    buf = b''
    while True:
        bytes_read = f.read(rand.randint(1, 40))  # just for testing
        # bytes_read = f.read(65536)
        buf += bytes_read
        if not bytes_read:
            break
        process_block(buf[:-16])
        # Keep only the candidate tag; it is re-examined after the next read.
        buf = buf[-16:]
    verify(buf[-16:])
It works by always leaving 16 bytes at the end of buf until EOS, then finally processing the last 16. Note that if there aren't at least 17 bytes in buf then buf[:-16] returns the empty bytes object.
I am trying to read a big file of 30 MB character by character. I found an interesting article on how to read a big file. Fast Method to Stream Big files
Problem: Output printing binary data instead of actual human readable text
Code:
def getRow(filepath):
    """Yield the comma-split records of *filepath* in shuffled order.

    Record positions come from ``get_offsets(filepath)`` and are shuffled
    before use. Each yielded value is the list produced by splitting one
    line, read at one of those positions, on commas.

    NOTE(review): ``f.fileno()`` on the object returned by ``gzip.open`` is
    the descriptor of the underlying compressed file, so the memory map
    exposes the raw .gz bytes, not the decompressed text. That behaviour is
    left unchanged here.
    """
    offsets = get_offsets(filepath)
    random.shuffle(offsets)
    with gzip.open(filepath, "r+b") as f:
        mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            for position in offsets:
                mm.seek(position)
                record = mm.readline()
                # b"," equals "," on Python 2 and matches the bytes that
                # mmap returns on Python 3.
                x = record.split(b",")
                yield x
        finally:
            # Release the mapping even if the consumer stops iterating early.
            mm.close()
def get_offsets(input_filename):
    """Return the file position reached after each line of *input_filename*.

    The file is memory-mapped read-only and scanned with ``readline``. The
    recorded value for a line is the position just past it, so the last
    entry is the position at which the scan hit end of file.
    """
    offsets = []
    with open(input_filename, 'r+b') as f:
        mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            # The sentinel must be bytes: on Python 3 readline() returns b''
            # at end of file, which never equals '' (on Python 2 they are
            # the same object type, so b'' works there too).
            for _ in iter(mm.readline, b''):
                offsets.append(mm.tell())
        finally:
            mm.close()
    return offsets
# Print every record produced by getRow; print(...) with a single argument
# behaves the same on Python 2 and Python 3.
for line in getRow("hello.dat.gz"):
    print(line)
Output: The output is producing some weird binary data.
['w\xc1\xd9S\xabP8xy\x8f\xd8\xae\xe3\xd8b&\xb6"\xbeZ\xf3P\xdc\x19&H\\#\x8e\x83\x0b\x81?R\xb0\xf2\xb5\xc1\x88rJ\
Am I doing something terribly stupid?
EDIT:
I found the problem. It is because of gzip.open. Not sure how to get rid of this. Any ideas?
As per the documentation of GZipFile:
fileno(self)
Invoke the underlying file object's `fileno()` method.
You are mapping a view of the compressed .gz file, not a view of the decompressed data.
mmap() can only operate on OS file handles, it cannot map arbitrary Python file objects.
So no, you cannot transparently map a decompressed view of a compressed file unless this is supported directly by the underlying operating system.
I am reading some values from a file and want to write the modified values back into a file. My file is in .ktx format [binary packed format].
I am using struct.pack() but seems that something is going wrong with that:
bytes = file.read(4)
bytesAsInt = struct.unpack("l",bytes)
number=1+(bytesAsInt[0])
number=hex(number)
no=struct.pack("1",number)
outfile.write(no)
I want to write in both ways little-endian and big-endian.
# "<" selects little-endian and ">" selects big-endian; the format code is a
# lowercase "l", not the digit one. struct.unpack returned a 1-tuple, so the
# value to pack is its first element. With no prefix, struct uses the
# machine's native byte order.
no_little = struct.pack("<l", bytesAsInt[0])
no_big = struct.pack(">l", bytesAsInt[0])
again you can check the docs and see the format characters you need
https://docs.python.org/3/library/struct.html
>>> struct.unpack("l","\x05\x04\x03\03")
(50529285,)
>>> struct.pack("l",50529285)
'\x05\x04\x03\x03'
>>> struct.pack("<l",50529285)
'\x05\x04\x03\x03'
>>> struct.pack(">l",50529285)
'\x03\x03\x04\x05'
also note that it is a lowercase L , not a one (as also covered in the docs)
I haven't tested this but the following function should solve your problem. At the moment it reads the file contents completely, creates a buffer and then writes out the updated contents. You could also modify the file buffer directly using unpack_from and pack_into but it might be slower (again, not tested). I'm using the struct.Struct class since you seem to want to unpack the same number many times.
import os
import struct
from StringIO import StringIO
def modify_values(in_file, out_file, increment=1, num_code="i", endian="<"):
    """Add *increment* to every packed number in *in_file*, writing *out_file*.

    The input is read completely and interpreted as consecutive values of
    struct format ``endian + num_code``. Trailing bytes that do not fill a
    whole value are dropped, as before.

    Args:
        in_file: Path of the binary file to read.
        out_file: Path of the binary file to write.
        increment: Amount added to every value.
        num_code: struct format character of one value.
        endian: struct byte-order prefix.

    Raises:
        struct.error: If an incremented value no longer fits *num_code*.
            Nothing is written to *out_file* in that case.
    """
    with open(in_file, "rb") as file_h:
        content = file_h.read()
    num = struct.Struct(endian + num_code)
    # Only whole values are processed; a partial trailing value is ignored.
    usable = len(content) - len(content) % num.size
    # unpack_from reads in place, avoiding a copy of the remaining content
    # on every iteration.
    packed = [
        num.pack(num.unpack_from(content, offset)[0] + increment)
        for offset in range(0, usable, num.size)
    ]
    # The output is opened only after every value packed successfully.
    with open(out_file, "wb") as file_h:
        file_h.write(b"".join(packed))
An alternative is to use the array module, which makes it quite easy. I don't know how to implement endianness with an array.
def modify_values(filename, increment=1, num_code="i"):
    """Add *increment* to every number stored in *filename*, in place.

    The file is read completely into an ``array`` of type code *num_code*,
    each element is incremented, and the file is rewritten.

    Args:
        filename: Path of the binary file to rewrite.
        increment: Amount added to every value.
        num_code: ``array`` type code of the stored values.
    """
    from array import array

    with open(filename, "rb") as file_h:
        # Honour num_code instead of always assuming "i".
        arr = array(num_code, file_h.read())
    arr = array(num_code, [value + increment for value in arr])
    with open(filename, "wb") as file_h:
        arr.tofile(file_h)
I have a binary file that consists of consecutive binary subsequences of fixed and equal length. Each subsequence can be unpacked into the same number of values. I know the length of each subsequence and the binary format of the values.
How can I work through the binary file, chopping out the subsequences,
unpacking their content and write them out as csv as I go.
I know how to write out as csv. My problem is the reading from file and unpacking part. This is my non-working code.
import csv
import sys
import struct
writer = csv.writer(sys.stdout, delimiter=',', quoting=csv.QUOTE_NONE,escapechar='\\')
? rows = sys.stdin. ?
? header = id, time ....
? write the header with csv
i = 0
for row in rows:
unpacked_row = unpack('QqqqddiBIBcsbshlshhlQB',row)
writer.writerow(unpacked_row)
i += 1
Possible solution using Reading binary file in Python and looping over each byte and the answer of ignacio.
First calculate chunksize = struct.calcsize()
def bytes_from_file(filename, chunksize=8192):
    """Yield the contents of *filename* in chunks of up to *chunksize* bytes.

    The file is opened in binary mode. Iteration stops at the first empty
    read, so the last chunk may be shorter than *chunksize*.
    """
    with open(filename, "rb") as f:
        while True:
            chunk = f.read(chunksize)
            if not chunk:
                break
            yield chunk
# example:
for chunk in bytes_from_file('filename'):
    # row = unpack(chunk)
    # write out row as csv
    pass  # placeholder so the loop has a body until the steps above are filled in
You need to calculate the size of the structure (Hint: struct.calcsize()) and read some multiple of that from the file at a time. You cannot directly iterate over the input as you can with a text file, since there is no delimiter as such.
You could use struct.Struct to unpack values from a file:
#!/usr/bin/env python
import csv
import sys
from struct import Struct
# One fixed-size binary record; record.size is the number of bytes to read.
record = Struct('QqqqddiBIBcsbshlshhlQB')
with open('input_filename', 'rb') as file:
    writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONE, escapechar='\\')
    while True:
        buf = file.read(record.size)
        if len(buf) != record.size:
            # End of file, or a truncated trailing record: stop.
            break
        writer.writerow(record.unpack_from(buf))
You could also write the while-loop as:
from functools import partial
# iter() with a sentinel keeps calling file.read(record.size) until it
# returns b''. Unlike the while-loop form, a short final read is still passed
# to unpack_from.
for buf in iter(partial(file.read, record.size), b''):
    writer.writerow(record.unpack_from(buf))
I have a 7GB csv file which I'd like to split into smaller chunks, so it is readable and faster for analysis in Python on a notebook. I would like to grab a small set from it, maybe 250MB, so how can I do this?
You don't need Python to split a csv file. Using your shell:
$ split -l 100 data.csv
Would split data.csv in chunks of 100 lines.
I had to do a similar task, and used the pandas package:
# Read the CSV 500000 rows at a time and write each chunk to its own file.
for i, chunk in enumerate(pd.read_csv('bigfile.csv', chunksize=500000)):
    chunk.to_csv('chunk{}.csv'.format(i), index=False)
Here is a little python script I used to split a file data.csv into several CSV part files. The number of part files can be controlled with chunk_size (number of lines per part file).
The header line (column names) of the original file is copied into every part CSV file.
It works for big files because it reads one line at a time with readline() instead of loading the complete file into memory at once.
#!/usr/bin/env python3
def main(chunk_size=9998):
    """Split data.csv into data_part_<n>.csv files of *chunk_size* lines.

    The first line of data.csv is treated as the header and is written at
    the top of every part file. Parts are numbered from 1; a final, shorter
    part holds any leftover lines.

    Args:
        chunk_size: Number of data lines per part file.
    """

    def write_chunk(part, lines):
        # `header` is read from the enclosing scope; it is assigned before
        # the first call.
        with open('data_part_' + str(part) + '.csv', 'w') as f_out:
            f_out.write(header)
            f_out.writelines(lines)

    with open('data.csv', 'r') as f:
        count = 0
        header = f.readline()
        lines = []
        for line in f:
            count += 1
            lines.append(line)
            if count % chunk_size == 0:
                write_chunk(count // chunk_size, lines)
                lines = []
        # write remainder
        if lines:
            write_chunk((count // chunk_size) + 1, lines)


if __name__ == '__main__':
    main()
This graph shows the runtime difference of the different approaches outlined by other posters (on an 8 core machine when splitting a 2.9 GB file with 11.8 million rows of data into ~290 files).
The shell approach is from Thomas Orozco, the Python approach is from Roberto, the Pandas approach is from Quentin Febvre, and here's the Dask snippet:
# Load the source CSV with Dask, then write it back out under ../tmp.
ddf = dd.read_csv(
    "../nyc-parking-tickets/Parking_Violations_Issued_-_Fiscal_Year_2015.csv",
    blocksize=10000000,
    dtype=dtypes,
)
ddf.to_csv("../tmp/split_csv_dask")
I'd recommend Dask for splitting files, even though it's not the fastest, because it's the most flexible solution (you can write out different file formats, perform processing operations before writing, easily modify compression formats, etc.). The Pandas approach is almost as flexible, but cannot perform processing on the entire dataset (like sorting the entire dataset before writing).
Bash / native Python filesystem operations are clearly quicker, but that's not what I'm typically looking for when I have a large CSV. I'm typically interested in splitting large CSVs into smaller Parquet files, for performant, production data analyses. I don't usually care if the actual splitting takes a couple of minutes more. I'm more interested in splitting accurately.
I wrote a blog post that discusses this in more detail. You can probably Google around and find the post.
See the Python docs on file objects (the object returned by open(filename)) - you can choose to read a specified number of bytes, or use readline to work through one line at a time.
Maybe something like this?
#!/usr/local/cpython-3.3/bin/python
import csv
divisor = 10  # rows per output file
outfileno = 1
outfile = None
# The csv module expects files opened with newline='' so that it controls
# line endings itself.
with open('big.csv', 'r', newline='') as infile:
    try:
        for index, row in enumerate(csv.reader(infile)):
            if index % divisor == 0:
                # Start a new output file every `divisor` rows.
                if outfile is not None:
                    outfile.close()
                outfilename = 'big-{}.csv'.format(outfileno)
                outfile = open(outfilename, 'w', newline='')
                outfileno += 1
                writer = csv.writer(outfile)
            writer.writerow(row)
    finally:
        # Close the last output file, which the loop never gets to close.
        if outfile is not None:
            outfile.close()
I agree with #jonrsharpe readline should be able to read one line at a time even for big files.
If you are dealing with big csv files might I suggest using pandas.read_csv. I often use it for the same purpose and always find it awesome (and fast). Takes a bit of time to get used to idea of DataFrames. But once you get over that it speeds up large operations like yours massively.
Hope it helps.
here is my code which might help
import os
import pandas as pd
import uuid
class FileSettings(object):
    """Settings for one split job: the source file and the rows per chunk."""

    def __init__(self, file_name, row_size=100):
        # Path of the CSV file to split.
        self.file_name = file_name
        # Number of rows written to each output file.
        self.row_size = row_size
class FileSplitter(object):
    """Split a CSV file into smaller CSV files of a fixed number of rows."""

    def __init__(self, file_settings):
        """Open the source CSV as a chunked pandas reader.

        Args:
            file_settings: Object exposing ``file_name`` and ``row_size``.

        Raises:
            TypeError: If *file_settings* lacks either attribute.
        """
        # Check for the attributes actually used instead of comparing the
        # class name as a string, which rejected subclasses.
        if not (hasattr(file_settings, "file_name")
                and hasattr(file_settings, "row_size")):
            raise TypeError("Please pass correct instance ")
        self.file_settings = file_settings
        self.df = pd.read_csv(self.file_settings.file_name,
                              chunksize=self.file_settings.row_size)

    def run(self, directory="temp"):
        """Write every chunk of the source file into *directory*.

        Output files are named
        ``<source stem>_<chunk index>_row_<row_size>_<uuid4>.csv``. If an
        error occurs while reading or writing a chunk, it is printed and
        processing stops.

        Args:
            directory: Output directory; created if it does not exist.

        Returns:
            Always ``True``.
        """
        try:
            os.makedirs(directory)
        except OSError:
            # An existing directory is fine; anything else is a real error.
            if not os.path.isdir(directory):
                raise

        # Use only the file's own name without its extension, so source
        # paths containing directories or extra dots give a valid output name.
        stem = os.path.splitext(
            os.path.basename(self.file_settings.file_name))[0]
        try:
            for counter, chunk in enumerate(self.df):
                file_name = "{}/{}_{}_row_{}_{}.csv".format(
                    directory, stem, counter,
                    self.file_settings.row_size, str(uuid.uuid4())
                )
                chunk.to_csv(file_name)
        except Exception as e:
            print("Error:", e)
        return True
def main():
    """Split sample1.csv into 10-row files in the default output directory."""
    helper = FileSplitter(FileSettings(
        file_name='sample1.csv',
        row_size=10
    ))
    helper.run()
main()
In the case of wanting to split by rough boundaries in bytes, the newest datapoints being the bottom-most ones and wanting to put the newest datapoints in the first file:
from pathlib import Path
TEN_MB = 10000000
FIVE_MB = 5000000


def split_file_into_chunks(path, chunk_size=TEN_MB):
    """Split *path* into chunks of roughly *chunk_size* bytes, newest first.

    The file is cut on line boundaries, working backwards from the end. Each
    chunk is written to ``<prefix>.arch.<n>.<ext>``, and every chunk cut from
    the tail starts with a copy of the file's first line. The remaining head
    of the file becomes the highest-numbered chunk. Finally chunk 0, which
    holds the bottom-most lines, replaces the original file and the original
    is deleted.

    An empty file is left untouched.

    Args:
        path: File to split; anything accepted by ``str()``.
        chunk_size: Minimum number of bytes collected before a chunk is cut.
    """
    path = str(path)
    output_prefix = path.rpartition('.')[0]
    output_ext = path.rpartition('.')[-1]

    with open(path, 'rb') as f:
        # Record the position after every line; the first line is the header.
        seek_positions = []
        for x, line in enumerate(f):
            if not x:
                header = line
            seek_positions.append(f.tell())

        if not seek_positions:
            # Nothing to split, and no header to copy.
            return

        part = 0
        last_seek_pos = seek_positions[-1]
        # Walk line boundaries from the end, cutting a chunk whenever at
        # least chunk_size bytes have accumulated.
        for seek_pos in reversed(seek_positions):
            if last_seek_pos - seek_pos >= chunk_size:
                with open(f'{output_prefix}.arch.{part}.{output_ext}', 'wb') as f_out:
                    f.seek(seek_pos)
                    f_out.write(header)
                    f_out.write(f.read(last_seek_pos - seek_pos))
                last_seek_pos = seek_pos
                part += 1

        # Whatever is left at the start of the file (header included) forms
        # the last chunk.
        with open(f'{output_prefix}.arch.{part}.{output_ext}', 'wb') as f_out:
            f.seek(0)
            f_out.write(f.read(last_seek_pos))

    # Swap chunk 0 into the original's place, then drop the original.
    Path(path).rename(path + '~')
    Path(f'{output_prefix}.arch.0.{output_ext}').rename(path)
    Path(path + '~').unlink()