I am trying to read a big file of 30 MB character by character. I found an interesting article on how to read a big file. Fast Method to Stream Big files
Problem: Output printing binary data instead of actual human readable text
Code:
def getRow(filepath):
offsets = get_offsets(filepath)
random.shuffle(offsets)
with gzip.open(filepath, "r+b") as f:
i = 0
mm = mmap.mmap(f.fileno(), 0, access = mmap.ACCESS_READ)
for position in offsets:
mm.seek(position)
record = mm.readline()
x = record.split(",")
yield x
def get_offsets(input_filename):
offsets = []
with open(input_filename, 'r+b') as f:
i = 0
mm = mmap.mmap(f.fileno(), 0, access = mmap.ACCESS_READ)
for record in iter(mm.readline, ''):
loc = mm.tell()
offsets.append(loc)
i += 1
return offsets
for line in getRow("hello.dat.gz"):
print line
Output: The output is producing some weird binary data.
['w\xc1\xd9S\xabP8xy\x8f\xd8\xae\xe3\xd8b&\xb6"\xbeZ\xf3P\xdc\x19&H\\#\x8e\x83\x0b\x81?R\xb0\xf2\xb5\xc1\x88rJ\
Am I doing something terribly stupid?
EDIT:
I found the problem. It is because of gzip.open. Not sure how to get rid of this. Any ideas?
As per the documentation of GZipFile:
fileno(self)
Invoke the underlying file object's `fileno()` method.
You are mapping a view of the compressed .gz file, not a view of the compressed data.
mmap() can only operate on OS file handles, it cannot map arbitrary Python file objects.
So no, you cannot transparently map a decompressed view of a compressed file unless this is supported directly by the underlying operating system.
Related
I am trying to batch a very large text file (approximately 150 gigabytes) into several smaller text files (approximately 10 gigabytes).
My general process will be:
# iterate over file one line at a time
# accumulate batch as string
--> # given a certain count that correlates to the size of my current accumulated batch and when that size is met: (this is where I am unsure)
# write to file
# accumulate size count
I have a rough metric to calculate when to batch (when the desired batch size) but am not so clear how I should calculate how often to write to disk for a given batch. For example, if my batch size is 10 gigabytes, I assume I will need to iteratively write rather than hold the entire 10 gigbyte batch in memory. I obviously do not want to write more than I have to as this could be quite expensive.
Do ya'll have any rough calculations or tricks that you like to use to figure out when to write to disk for task such as this, e.g. size vs memory or something?
Assuming your large file is simple unstructured text, i.e. this is no good for structured text like JSON, here's an alternative to reading every single line: read large binary bites of the input file until at your chunksize then read a couple of lines, close the current output file and move on to the next.
I compared this with line-by-line using #tdelaney code adapted with the same chunksize as my code - that code took 250s to split a 12GiB input file into 6x2GiB chunks, whereas this took ~50s so maybe five times faster and looks like it's I/O bound on my SSD running >200MiB/s read and write, where the line-by-line was running 40-50MiB/s read and write.
I turned buffering off because there's not a lot of point. The size of bite and the buffering setting may be tuneable to improve performance, haven't tried any other settings as for me it seems to be I/O bound anyway.
import time
outfile_template = "outfile-{}.txt"
infile_name = "large.text"
chunksize = 2_000_000_000
MEB = 2**20 # mebibyte
bitesize = 4_000_000 # the size of the reads (and writes) working up to chunksize
count = 0
starttime = time.perf_counter()
infile = open(infile_name, "rb", buffering=0)
outfile = open(outfile_template.format(count), "wb", buffering=0)
while True:
byteswritten = 0
while byteswritten < chunksize:
bite = infile.read(bitesize)
# check for EOF
if not bite:
break
outfile.write(bite)
byteswritten += len(bite)
# check for EOF
if not bite:
break
for i in range(2):
l = infile.readline()
# check for EOF
if not l:
break
outfile.write(l)
# check for EOF
if not l:
break
outfile.close()
count += 1
print( count )
outfile = open(outfile_template.format(count), "wb", buffering=0)
outfile.close()
infile.close()
endtime = time.perf_counter()
elapsed = endtime-starttime
print( f"Elapsed= {elapsed}" )
NOTE I haven't exhaustively tested this doesn't lose data, although no evidence it does lose anything you should validate that yourself.
Might be useful to add some robustness by checking when at the end of a chunk to see how much data is left to read, so you don't end up with the last output file being 0-length (or shorter than bitesize)
HTH
barny
I used slightly modificated version of this for parsing 250GB json, I choose how many smaller files I need number_of_slices and then I find positions where to slice a file (I always look for line end). FInally i slice file with file.seek and file.read(chunk)
import os
import mmap
FULL_PATH_TO_FILE = 'full_path_to_a_big_file'
OUTPUT_PATH = 'full_path_to_a_output_dir' # where sliced files will be generated
def next_newline_finder(mmapf):
def nl_find(mmapf):
while 1:
current = hex(mmapf.read_byte())
if hex(ord('\n')) == current: # or whatever line-end symbol
return(mmapf.tell())
return nl_find(mmapf)
# find positions where to slice a file
file_info = os.stat(FULL_PATH_TO_FILE)
file_size = file_info.st_size
positions_for_file_slice = [0]
number_of_slices = 15 # say u want slice the big file to 15 smaller files
size_per_slice = file_size//number_of_slices
with open(FULL_PATH_TO_FILE, "r+b") as f:
mmapf = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
slice_counter = 1
while slice_counter < number_of_slices:
pos = size_per_slice*slice_counter
mmapf.seek(pos)
newline_pos = next_newline_finder(mmapf)
positions_for_file_slice.append(newline_pos)
slice_counter += 1
# create ranges for found positions (from, to)
positions_for_file_slice = [(pos, positions_for_file_slice[i+1]) if i < (len(positions_for_file_slice)-1) else (
positions_for_file_slice[i], file_size) for i, pos in enumerate(positions_for_file_slice)]
# do actual slice of a file
with open(FULL_PATH_TO_FILE, "rb") as f:
for i, position_pair in enumerate(positions_for_file_slice):
read_from, read_to = position_pair
f.seek(read_from)
chunk = f.read(read_to-read_from)
with open(os.path.join(OUTPUT_PATH, f'dummyfile{i}.json'), 'wb') as chunk_file:
chunk_file.write(chunk)
Here is an example of line-by-line writes. Its opened in binary mode to avoid the line decode step which takes a modest amount of time but can skew character counts. For instance, utf-8 encoding may use multiple bytes on disk for a single python character.
4 Meg is a guess at buffering. The idea is to get the operating system to read more of the file at once, reducing seek times. Whether this works or the best number to use is debatable - and will be different for different operating systems. I found 4 meg makes a difference... but that was years ago and things change.
outfile_template = "outfile-{}.txt"
infile_name = "infile.txt"
chunksize = 10_000_000_000
MEB = 2**20 # mebibyte
count = 0
byteswritten = 0
infile = open(infile_name, "rb", buffering=4*MEB)
outfile = open(outfile_template.format(count), "wb", buffering=4*MEB)
try:
for line in infile:
if byteswritten > chunksize:
outfile.close()
byteswritten = 0
count += 1
outfile = open(outfile_template.format(count), "wb", buffering=4*MEB)
outfile.write(line)
byteswritten += len(line)
finally:
infile.close()
outfile.close()
I am reading hex data from a .csv file that has multiple rows (example format: FFFDF3FFFBF2FFFAF210FFF0) using the following code:
with open('c:\\temp\\results.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=",")
line_count = 0
file = open('c:\\temp\\sent.csv', 'w')
for row in csv_reader:
hex_string = f'{row[0]}'
bytes_object = bytes.fromhex(hex_string)
file.write(str(bytes_object) + '\n')
line_count += 1
file.close()
The output file contains mutliple rows that are converted to this format (sorry new to python so not sure if this is a bytearray or what it is actually called): b'\xff\xfd\xf3\xff\xfb\xf2\xff\xfa\xf2\x10\xff\xf0'
I am trying to convert back from this format to the orginal format reading the rows of the newly created .csv file (need to edit readable ascii in the file and covert back for use in another program).
file = open('c:\\temp\\sent.csv', 'r')
for row in file:
byte_string = row
# hex_object = byte_string.hex()
#THIS works if I enter the byte array in directly, but not if reading
#from file hex_object = byte_string.hex()
hex_object = b'\xff\xfd\x03\xff\xfb\x03\xff\xfd\x01\xff\xfb\x17\xff\xfa\xff\xf0\xff\xfd\x00\xff\xfb\x00'.hex()
print(hex_object)
#print(byte_string)
# writer.writerow(hex_object)
Is there a way to get this to work? I have tried several encoding methods, but since the data is already in the proper format I really just need to get it in a readable type for the .hex() method. I am using the latest version of Python 3.8.1enter code here
You are storing a textual representation of your bytes object and then trying to read it back without conversion to/from binary. Instead you are better off opening the output file in binary format like this:
file = open('c:\\temp\\sent.csv', 'wb')
and write the bytes to file:
bytes_object = bytes.fromhex(hex_string)
file.write(bytes_object)
(no need for newline character).
Then to do the opposite open in binary format:
with open('c:\\temp\\sent.csv', "rb") as f:
data = f.read()
s = data.hex()
print(s)
Here data is a bytes object and it has the hex() function you are looking for.
I am trying to convert a file containing more than 1 billion bytes into integers. Obviously, my machine cannot do this at once so I need to chunk my code. I was able to decode the first 50,000,000 bytes but I am wondering how to read the integers in the file that are between 50,000,001 and 100,000,000, 150,000,000 and 200,000,000 etc. The following is what I have now;the range function is not working with this.
import struct
with open(x, "rb") as f:
this_chunk = range(50000001, 100000000)
data = f.read(this_chunk)
ints1 = struct.unpack("I" * (this_chunk //4) , data)
print(ints1)
You can use f.seek(offset) to set the file pointer to start reading from a certain offset.
In your case, you'd want to skip 5000000 bytes, so you'd call
f.seek(50000000)
At this point, you'd want to read another 50000000 bytes, so you'd call f.read(50000000).
This would be your complete code listing, implementing f.seek and reading the whole file:
with open(x, "rb") as f:
f.seek(50000000) # omit if you don't want to skip this chunk
data = f.read(50000000)
while data:
... # do something
data = f.read(50000000)
Use f.read(50000000) in a loop at it will read the file in chunks of 50000000, e.g.:
In []:
from io import StringIO
s = '''hello'''
with StringIO(s) as f:
while True:
c = f.read(2)
if not c:
break
print(c)
Out[]:
he
ll
o
I am working on a script where it will breakdown another python script into blocks and using pycrypto to encrypt the blocks (all of this i have successfully done so far), now i am storing the encrypted blocks to a file so that the decrypter can read it and execute each block. The final result of the encryption is a list of binary outputs (something like blocks=[b'\xa1\r\xa594\x92z\xf8\x16\xaa',b'xfbI\xfdqx|\xcd\xdb\x1b\xb3',etc...]).
When writing the output to a file, they all end up into one giant line, so that when reading the file, all the bytes come back in one giant line, instead of each item from the original list. I also tried converting the bytes into a string, and adding a '\n' at the end of each one, but the problem there is that I still need the bytes, and I can't figure out how to undo the string to get the original byte.
To summarize this, i am looking to either: write each binary item to a separate line in a file so i can easily read the data and use it in the decryption, or i could translate the data to a string and in the decrpytion undo the string to get back the original binary data.
Here is the code for writing to the file:
new_file = open('C:/Python34/testfile.txt','wb')
for byte_item in byte_list:
# This or for the string i just replaced wb with w and
# byte_item with ascii(byte_item) + '\n'
new_file.write(byte_item)
new_file.close()
and for reading the file:
# Or 'r' instead of 'rb' if using string method
byte_list = open('C:/Python34/testfile.txt','rb').readlines()
A file is a stream of bytes without any implied structure. If you want to load a list of binary blobs then you should store some additional metadata to restore the structure e.g., you could use the netstring format:
#!/usr/bin/env python
blocks = [b'\xa1\r\xa594\x92z\xf8\x16\xaa', b'xfbI\xfdqx|\xcd\xdb\x1b\xb3']
# save blocks
with open('blocks.netstring', 'wb') as output_file:
for blob in blocks:
# [len]":"[string]","
output_file.write(str(len(blob)).encode())
output_file.write(b":")
output_file.write(blob)
output_file.write(b",")
Read them back:
#!/usr/bin/env python3
import re
from mmap import ACCESS_READ, mmap
blocks = []
match_size = re.compile(br'(\d+):').match
with open('blocks.netstring', 'rb') as file, \
mmap(file.fileno(), 0, access=ACCESS_READ) as mm:
position = 0
for m in iter(lambda: match_size(mm, position), None):
i, size = m.end(), int(m.group(1))
blocks.append(mm[i:i + size])
position = i + size + 1 # shift to the next netstring
print(blocks)
As an alternative, you could consider BSON format for your data or ascii armor format.
I think what you're looking for is byte_list=open('C:/Python34/testfile.txt','rb').read()
If you know how many bytes each item is, you can use read(number_of_bytes) to process one item at a time.
read() will read the entire file, but then it is up to you to decode that entire list of bytes into their respective items.
In general, since you're using Python 3, you will be working with bytes objects (which are immutable) and/or bytearray objects (which are mutable).
Example:
b1 = bytearray('hello', 'utf-8')
print b1
b1 += bytearray(' goodbye', 'utf-8')
print b1
open('temp.bin', 'wb').write(b1)
#------
b2 = open('temp.bin', 'rb').read()
print b2
Output:
bytearray(b'hello')
bytearray(b'hello goodbye')
b'hello goodbye'
I am reading some value for file and wants to write modified value into file. My file is .ktx format [binary packed format].
I am using struct.pack() but seems that something is going wrong with that:
bytes = file.read(4)
bytesAsInt = struct.unpack("l",bytes)
number=1+(bytesAsInt[0])
number=hex(number)
no=struct.pack("1",number)
outfile.write(no)
I want to write in both ways little-endian and big-endian.
no_little =struct.pack(">1",bytesAsInt)
no_big =struct.pack("<1",bytesAsInt) # i think this is default ...
again you can check the docs and see the format characters you need
https://docs.python.org/3/library/struct.html
>>> struct.unpack("l","\x05\x04\x03\03")
(50529285,)
>>> struct.pack("l",50529285)
'\x05\x04\x03\x03'
>>> struct.pack("<l",50529285)
'\x05\x04\x03\x03'
>>> struct.pack(">l",50529285)
'\x03\x03\x04\x05'
also note that it is a lowercase L , not a one (as also covered in the docs)
I haven't tested this but the following function should solve your problem. At the moment it reads the file contents completely, creates a buffer and then writes out the updated contents. You could also modify the file buffer directly using unpack_from and pack_into but it might be slower (again, not tested). I'm using the struct.Struct class since you seem to want to unpack the same number many times.
import os
import struct
from StringIO import StringIO
def modify_values(in_file, out_file, increment=1, num_code="i", endian="<"):
with open(in_file, "rb") as file_h:
content = file_h.read()
num = struct.Struct(endian + num_code)
buf = StringIO()
try:
while len(content) >= num.size:
value = num.unpack(content[:num.size])[0]
value += increment
buf.write(num.pack(value))
content = content[num.size:]
except Exception as err:
# handle
else:
buf.seek(0)
with open(out_file, "wb") as file_h:
file_h.write(buf.read())
An alternative is to use the array which makes it quite easy. I don't know how to implement endianess with an array.
def modify_values(filename, increment=1, num_code="i"):
with open(filename, "rb") as file_h:
arr = array("i", file_h.read())
for i in range(len(arr)):
arr[i] += increment
with open(filename, "wb") as file_h:
arr.tofile(file_h)