writing data into file with binary packed format in python

I am reading some values from a file and want to write the modified values back into a file. My file is in .ktx format [a binary packed format].
I am using struct.pack(), but something seems to be going wrong with it:
bytes = file.read(4)
bytesAsInt = struct.unpack("l",bytes)
number=1+(bytesAsInt[0])
number=hex(number)
no=struct.pack("1",number)
outfile.write(no)
I want to be able to write it in both ways: little-endian and big-endian.

no_little = struct.pack("<l", number)  # "<" = little-endian
no_big = struct.pack(">l", number)     # ">" = big-endian; with no prefix, native byte order is used
Again, you can check the docs to see the format characters you need:
https://docs.python.org/3/library/struct.html
>>> struct.unpack("l","\x05\x04\x03\x03")
(50529285,)
>>> struct.pack("l",50529285)
'\x05\x04\x03\x03'
>>> struct.pack("<l",50529285)
'\x05\x04\x03\x03'
>>> struct.pack(">l",50529285)
'\x03\x03\x04\x05'
Also note that it is a lowercase L, not a one (as also covered in the docs).
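Putting it together, a minimal sketch of the corrected read-modify-write loop (an assumption on my part: the value is a 4-byte signed integer stored little-endian, and file and outfile are already open in binary mode):

import struct

bytes_read = file.read(4)                       # 4 bytes from the input file
(value,) = struct.unpack("<l", bytes_read)      # little-endian signed 32-bit int
value += 1
outfile.write(struct.pack("<l", value))         # use ">l" here to write big-endian instead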

I haven't tested this, but the following function should solve your problem. At the moment it reads the file contents completely, creates a buffer, and then writes out the updated contents. You could also modify the file buffer directly using unpack_from and pack_into, but that might be slower (again, not tested). I'm using the struct.Struct class since you seem to want to unpack the same format many times.
import struct
from StringIO import StringIO  # on Python 3, use io.BytesIO for binary data

def modify_values(in_file, out_file, increment=1, num_code="i", endian="<"):
    with open(in_file, "rb") as file_h:
        content = file_h.read()
    num = struct.Struct(endian + num_code)
    buf = StringIO()
    try:
        while len(content) >= num.size:
            value = num.unpack(content[:num.size])[0]
            value += increment
            buf.write(num.pack(value))
            content = content[num.size:]
    except Exception as err:
        # handle the error here
        raise
    else:
        buf.seek(0)
        with open(out_file, "wb") as file_h:
            file_h.write(buf.read())
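For example (the file names here are just placeholders), adding 1 to every 4-byte little-endian signed integer might look like:

modify_values("input.ktx", "output.ktx", increment=1, num_code="l", endian="<")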
An alternative is to use the array module, which makes it quite easy. I don't know how to implement endianness with an array, though (one possible approach is sketched after the code).
from array import array

def modify_values(filename, increment=1, num_code="i"):
    with open(filename, "rb") as file_h:
        arr = array(num_code, file_h.read())
    for i in range(len(arr)):
        arr[i] += increment
    with open(filename, "wb") as file_h:
        arr.tofile(file_h)
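One way to handle byte order with array (a sketch of my own, not from the original answer): array.array always works in the machine's native order, so if the file's order differs from sys.byteorder you can swap bytes after reading and again before writing with array.byteswap():

import sys
from array import array

def modify_values_endian(filename, increment=1, num_code="i", file_order="little"):
    with open(filename, "rb") as file_h:
        arr = array(num_code, file_h.read())
    if sys.byteorder != file_order:
        arr.byteswap()              # file order -> native order
    for i in range(len(arr)):
        arr[i] += increment
    if sys.byteorder != file_order:
        arr.byteswap()              # native order -> back to the file's order
    with open(filename, "wb") as file_h:
        arr.tofile(file_h)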

Related

Read a file in byte chunks using python

I am trying to convert a file containing more than 1 billion bytes into integers. Obviously, my machine cannot do this all at once, so I need to chunk my code. I was able to decode the first 50,000,000 bytes, but I am wondering how to read the integers in the file that lie between bytes 50,000,001 and 100,000,000, 150,000,000 and 200,000,000, etc. The following is what I have now; the range function does not work with this.
import struct

with open(x, "rb") as f:
    this_chunk = range(50000001, 100000000)
    data = f.read(this_chunk)
    ints1 = struct.unpack("I" * (this_chunk // 4), data)
    print(ints1)
You can use f.seek(offset) to set the file pointer to start reading from a certain offset.
In your case, you'd want to skip the first 50000000 bytes, so you'd call
f.seek(50000000)
At this point, you'd want to read the next 50000000 bytes, so you'd call f.read(50000000).
This would be your complete code listing, implementing f.seek and reading the whole file:
with open(x, "rb") as f:
    f.seek(50000000)  # omit if you don't want to skip this chunk
    data = f.read(50000000)
    while data:
        ...  # do something with data
        data = f.read(50000000)
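A sketch of tying that back to struct.unpack (my own addition, not part of the answer; x is the filename variable from the question, and any trailing partial integer at the end of the file is dropped):

import struct

CHUNK = 50000000  # 50,000,000 bytes per chunk

with open(x, "rb") as f:
    f.seek(CHUNK)                  # skip the first chunk, which was already decoded
    while True:
        data = f.read(CHUNK)
        if not data:
            break
        data = data[:len(data) - len(data) % 4]        # trim a trailing partial int, if any
        ints = struct.unpack("I" * (len(data) // 4), data)
        # ... process ints ...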
Use f.read(50000000) in a loop and it will read the file in chunks of 50000000 bytes, e.g.:
In []:
from io import StringIO

s = '''hello'''
with StringIO(s) as f:
    while True:
        c = f.read(2)
        if not c:
            break
        print(c)

Out[]:
he
ll
o

mmap in python printing binary data instead of text

I am trying to read a big file of 30 MB character by character. I found an interesting article on how to read a big file: Fast Method to Stream Big Files.
Problem: the output prints binary data instead of actual human-readable text.
Code:
import gzip
import mmap
import random

def getRow(filepath):
    offsets = get_offsets(filepath)
    random.shuffle(offsets)
    with gzip.open(filepath, "r+b") as f:
        i = 0
        mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        for position in offsets:
            mm.seek(position)
            record = mm.readline()
            x = record.split(",")
            yield x

def get_offsets(input_filename):
    offsets = []
    with open(input_filename, 'r+b') as f:
        i = 0
        mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        for record in iter(mm.readline, ''):
            loc = mm.tell()
            offsets.append(loc)
            i += 1
    return offsets

for line in getRow("hello.dat.gz"):
    print line
Output: some weird binary data is produced:
['w\xc1\xd9S\xabP8xy\x8f\xd8\xae\xe3\xd8b&\xb6"\xbeZ\xf3P\xdc\x19&H\\#\x8e\x83\x0b\x81?R\xb0\xf2\xb5\xc1\x88rJ\
Am I doing something terribly stupid?
EDIT:
I found the problem. It is because of gzip.open. Not sure how to get rid of this. Any ideas?
As per the documentation of GzipFile:
fileno(self)
Invoke the underlying file object's `fileno()` method.
You are mapping a view of the compressed .gz file, not a view of the decompressed data.
mmap() can only operate on OS file handles, it cannot map arbitrary Python file objects.
So no, you cannot transparently map a decompressed view of a compressed file unless this is supported directly by the underlying operating system.
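One workaround sketch (an assumption on my part, not from the answer): skip mmap entirely and let GzipFile decompress the stream, iterating over the decompressed lines sequentially; random access into the decompressed data would require decompressing to a temporary file first.

import gzip

def get_rows(filepath):
    with gzip.open(filepath, "rb") as f:        # GzipFile decompresses transparently
        for record in f:                        # iterates over decompressed lines
            yield record.rstrip(b"\n").split(b",")

for row in get_rows("hello.dat.gz"):
    print(row)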

How to extract multiple JSON objects from one file?

I am very new to JSON files. If I have a JSON file with multiple JSON objects such as the following:
{"ID":"12345","Timestamp":"20140101", "Usefulness":"Yes",
"Code":[{"event1":"A","result":"1"},…]}
{"ID":"1A35B","Timestamp":"20140102", "Usefulness":"No",
"Code":[{"event1":"B","result":"1"},…]}
{"ID":"AA356","Timestamp":"20140103", "Usefulness":"No",
"Code":[{"event1":"B","result":"0"},…]}
…
I want to extract all "Timestamp" and "Usefulness" fields into a data frame:
Timestamp Usefulness
0 20140101 Yes
1 20140102 No
2 20140103 No
…
Does anyone know a general way to deal with such problems?
Update: I wrote a solution that doesn't require reading the entire file in one go. It's too big for a Stack Overflow answer, but can be found here: jsonstream.
You can use json.JSONDecoder.raw_decode to decode arbitrarily big strings of "stacked" JSON (so long as they can fit in memory). raw_decode stops once it has a valid object and returns the position just past the parsed object. It's not well documented, but you can pass this position back to raw_decode and it will start parsing again from there. Unfortunately, the Python json module doesn't accept strings that have leading whitespace, so we need to search for the first non-whitespace part of the document.
from json import JSONDecoder, JSONDecodeError
import re

NOT_WHITESPACE = re.compile(r'\S')

def decode_stacked(document, pos=0, decoder=JSONDecoder()):
    while True:
        match = NOT_WHITESPACE.search(document, pos)
        if not match:
            return
        pos = match.start()
        try:
            obj, pos = decoder.raw_decode(document, pos)
        except JSONDecodeError:
            # do something sensible if there's some error
            raise
        yield obj
s = """
{"a": 1}
[
1
,
2
]
"""
for obj in decode_stacked(s):
    print(obj)
prints:
{'a': 1}
[1, 2]
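If the end goal is the data frame from the question, a possible follow-up (my assumptions: pandas is installed, the stacked file fits in memory, and "file.json" stands in for its real name):

import pandas as pd

with open("file.json") as f:
    records = list(decode_stacked(f.read()))

# each record is a dict, so a DataFrame can be built directly from the list
df = pd.DataFrame(records)[["Timestamp", "Usefulness"]]
print(df)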
Use a JSON array, in the format:
[
{"ID":"12345","Timestamp":"20140101", "Usefulness":"Yes",
"Code":[{"event1":"A","result":"1"},…]},
{"ID":"1A35B","Timestamp":"20140102", "Usefulness":"No",
"Code":[{"event1":"B","result":"1"},…]},
{"ID":"AA356","Timestamp":"20140103", "Usefulness":"No",
"Code":[{"event1":"B","result":"0"},…]},
...
]
Then load it in your Python code:
import json

with open('file.json') as json_file:
    data = json.load(json_file)
Now the content of data is a list of dictionaries, one per element.
You can access it easily, e.g.:
data[0]["ID"]
So, as was mentioned in a couple of comments, containing the data in an array is simpler, but the solution does not scale well in terms of efficiency as the data set grows. You should really only build a full list when you need random access to its items; otherwise, generators are the way to go. Below I have prototyped a reader function which reads each JSON object individually and returns a generator.
The basic idea is to have the reader split on the newline character "\n" (or "\r\n" on Windows). Python can do this by iterating over the file line by line (or with the file.readline() function).
import json

def json_reader(filename):
    with open(filename) as f:
        for line in f:
            yield json.loads(line)
However, this method only really works when the file is written as you have it, with each object separated by a newline character. Below is an example of a writer that takes an array of JSON objects and saves each one on a new line:
def json_writer(file, json_objects):
    with open(file, "w") as f:
        for jsonobj in json_objects:
            jsonstr = json.dumps(jsonobj)
            f.write(jsonstr + "\n")
You could also do the same operation with file.writelines() and a list comprehension:
...
json_strs = [json.dumps(j) + "\n" for j in json_objects]
f.writelines(json_strs)
...
And if you wanted to append the data instead of writing a new file just change open(file, "w") to open(file, "a").
In the end I find this helps a great deal, not only with readability when I open JSON files in a text editor, but also in terms of using memory more efficiently.
On that note, if you change your mind at some point and want a list out of the reader, Python lets you pass the generator to list() and populate the list automatically. In other words, just write
lst = list(json_reader(file))
Added streaming support, based on @dunes' answer:
import re
from json import JSONDecoder, JSONDecodeError

NOT_WHITESPACE = re.compile(r"[^\s]")

def stream_json(file_obj, buf_size=1024, decoder=JSONDecoder()):
    buf = ""
    ex = None
    while True:
        block = file_obj.read(buf_size)
        if not block:
            break
        buf += block
        pos = 0
        while True:
            match = NOT_WHITESPACE.search(buf, pos)
            if not match:
                break
            pos = match.start()
            try:
                obj, pos = decoder.raw_decode(buf, pos)
            except JSONDecodeError as e:
                ex = e
                break
            else:
                ex = None
                yield obj
        buf = buf[pos:]
    if ex is not None:
        raise ex
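A short usage example (the file name is a placeholder of mine):

with open("stacked.json") as f:
    for obj in stream_json(f):
        print(obj)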

How to read a simple binary file

I have a binary file that consists of consecutive binary subsequences of fixed and equal length. Each subsequence can be unpacked into the same number of values. I know the length of each subsequence and the binary format of the values.
How can I work through the binary file, chopping out the subsequences, unpacking their contents, and writing them out as CSV as I go?
I know how to write out as CSV. My problem is the reading-from-file and unpacking part. This is my non-working code:
import csv
import sys
import struct

writer = csv.writer(sys.stdout, delimiter=',', quoting=csv.QUOTE_NONE, escapechar='\\')
? rows = sys.stdin. ?
? header = id, time ....
? write the header with csv
i = 0
for row in rows:
    unpacked_row = unpack('QqqqddiBIBcsbshlshhlQB', row)
    writer.writerow(unpacked_row)
    i += 1
A possible solution, using "Reading binary file in Python and looping over each byte" and Ignacio's answer below:
First calculate the chunk size with struct.calcsize(), then read the file in chunks of that size.
def bytes_from_file(filename, chunksize=8192):
    with open(filename, "rb") as f:
        while True:
            chunk = f.read(chunksize)
            if chunk:
                yield chunk
            else:
                break

# example:
for chunk in bytes_from_file('filename'):
    # row = unpack(chunk)
    # write out row as csv
    pass
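Tying the two together, a sketch (the filename is a placeholder; the format string is taken from the question, and a short trailing chunk would raise struct.error):

import csv
import struct
import sys

fmt = 'QqqqddiBIBcsbshlshhlQB'
writer = csv.writer(sys.stdout, delimiter=',', quoting=csv.QUOTE_NONE, escapechar='\\')
for chunk in bytes_from_file('data.bin', chunksize=struct.calcsize(fmt)):
    writer.writerow(struct.unpack(fmt, chunk))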
You need to calculate the size of the structure (Hint: struct.calcsize()) and read some multiple of that from the file at a time. You cannot directly iterate over the input as you can with a text file, since there is no delimiter as such.
You could use struct.Struct to unpack values from a file:
#!/usr/bin/env python
import csv
import sys
from struct import Struct

record = Struct('QqqqddiBIBcsbshlshhlQB')
with open('input_filename', 'rb') as file:
    writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONE, escapechar='\\')
    while True:
        buf = file.read(record.size)
        if len(buf) != record.size:
            break
        writer.writerow(record.unpack_from(buf))
You could also write the while-loop as:
from functools import partial

for buf in iter(partial(file.read, record.size), b''):
    writer.writerow(record.unpack_from(buf))

Python3 (v3.2.2) extra bits when writing binary files

I have been working on a function to map the bytes of a binary file to another set of bytes. I am reading from and writing to the same file. My problem is that every time I do it I end up with extra bytes, unless I move to the end of the file before closing it. Here is my code:
import array

with open(self._path, 'r+b') as source:
    for lookAt in range(0, self._size[1]*self._size[2], 1):
        source.seek(lookAt*self._size[0], 0)
        readBuffer = array.array('B')
        readBuffer.fromfile(source, self._size[0])
        newLine = array.array('B', [mappingDict[mat] for mat in readBuffer])
        source.seek(lookAt*self._size[0], 0)
        newLine.tofile(source)
    source.seek(0, 2)  # Magic line that solves stupid bug
    source.close()
I am using the array module to read and write data since I got the same problem when I used read() and write(). I do not understand why the 'magic line' solves the problem, since the position it seeks to is never used. I would appreciate any insight into this.
Comment (answer follows):
I see the same behavior as you:
#!/usr/bin/env python3
import os
import sys

filename = '/tmp/a'
with open(filename, 'wb') as f:
    f.write(b'1234a67b8ca')
print(open(filename, 'rb').read())

bufsize = 3
table = bytes.maketrans(b'abcde', b'xyzzz')  # mapping
with open(filename, 'r+b') as f:
    for i in range(0, os.path.getsize(filename), bufsize):
        f.seek(i, os.SEEK_SET)
        b = f.read(bufsize)  # result shouldn't depend on it due to 1 -> 1
        if not b:
            break
        f.seek(i, os.SEEK_SET)
        f.write(b.translate(table))
    f.seek(0, os.SEEK_END)  # magic
print(open(filename, 'rb').read())
Output (with the magic line, or with buffering=0, or with f.flush() after f.write):
b'1234a67b8ca'
b'1234x67y8zx'
Output (without magic line)
b'1234a67b8ca'
b'1234a67b8zx1234x67y8'
Answer:
If your mapping is 1 -> 1 you could use bytes.translate():
#!/usr/bin/env python3
import io
import os
import sys

filename = '/tmp/a'
data = b'1234a67b8ca'*10000
with open(filename, 'wb') as f:
    f.write(data)
assert data == open(filename, 'rb').read()
print(data[:10]+data[-10:])

bufsize = io.DEFAULT_BUFFER_SIZE
table = bytes.maketrans(b'abcde', b'xyzzz')  # mapping
with open(filename, 'r+b') as f:
    while True:
        b = f.read(bufsize)  # result shouldn't depend on bufsize due to 1 -> 1
        if not b:
            break
        f.seek(-len(b), os.SEEK_CUR)
        f.write(b.translate(table))
        f.flush()

tr_data = data.translate(table)
assert tr_data == open(filename, 'rb').read()
print(tr_data[:10]+tr_data[-10:])
It seems that io.BufferedRandom can't do interleaved read/seek/write (a bug in Python 3) without flush().
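Based on the buffering=0 note above, here is a minimal unbuffered variant (a sketch of my own, not part of the original answer): opening with buffering=0 gives a raw FileIO object and bypasses io.BufferedRandom entirely, so no flush or magic seek is needed.

import os

filename = '/tmp/a'
with open(filename, 'wb') as f:
    f.write(b'1234a67b8ca')

bufsize = 3
table = bytes.maketrans(b'abcde', b'xyzzz')
with open(filename, 'r+b', buffering=0) as f:   # raw, unbuffered file object
    for i in range(0, os.path.getsize(filename), bufsize):
        f.seek(i)
        b = f.read(bufsize)
        if not b:
            break
        f.seek(i)
        f.write(b.translate(table))

print(open(filename, 'rb').read())   # b'1234x67y8zx'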
Having experimented with this a little, I conjecture that it is a bug in Python 3.
In support of my conjecture, I offer the following code (based on @J.F. Sebastian's):
import os
import sys

filename = '/tmp/a'
with open(filename, 'wb') as f:
    f.write(b'1234a67b8ca')
print(open(filename, 'rb').read())

bufsize = 3
with open(filename, 'r+b') as f:
    for i in range(0, os.path.getsize(filename), bufsize):
        f.seek(i, os.SEEK_SET)
        b = f.read(bufsize)
        f.seek(i, os.SEEK_SET)
        f.write(b)
    # f.seek(0, os.SEEK_END)  # magic
print(open(filename, 'rb').read())
When run using Python 2.7.1, it works as you'd expect, and the magic line makes no difference.
When run using Python 3.1.2, it inexplicably requires the magic no-op seek() in order to make it work as expected.
At this point I'd suggest demonstrating the code to core Python 3 developers to get their opinion on whether this is indeed a bug.
