Python3 (v3.2.2) extra bits when writing binary files - python

I have been working on a function to map the bytes of a binary file to another set of bytes. I am reading from and writing to the same file. My problem is that every time I do it I end up with extra bytes, unless I move to the end of the file before closing it. Here is my code:
with open(self._path, 'r+b') as source:
    for lookAt in range(0, self._size[1]*self._size[2], 1):
        source.seek(lookAt*self._size[0], 0)
        readBuffer = array.array('B')
        readBuffer.fromfile(source, self._size[0])
        newLine = array.array('B', [mappingDict[mat] for mat in readBuffer])
        source.seek(lookAt*self._size[0], 0)
        newLine.tofile(source)
    source.seek(0, 2)  # Magic line that solves stupid bug
    source.close()
I am using the array module to read and write data since I got the same problem when I used read() and write(). I do not understand why the 'Magic line' solves the problem, since its result is never used. I will appreciate any insight I can get on this.

Comment (answer follows):
I see the same behavior as you:
#!/usr/bin/env python3
import os
import sys

filename = '/tmp/a'
with open(filename, 'wb') as f:
    f.write(b'1234a67b8ca')
print(open(filename, 'rb').read())

bufsize = 3
table = bytes.maketrans(b'abcde', b'xyzzz')  # mapping
with open(filename, 'r+b') as f:
    for i in range(0, os.path.getsize(filename), bufsize):
        f.seek(i, os.SEEK_SET)
        b = f.read(bufsize)  # result shouldn't depend on it due to 1 -> 1
        if not b:
            break
        f.seek(i, os.SEEK_SET)
        f.write(b.translate(table))
    f.seek(0, os.SEEK_END)  # magic
print(open(filename, 'rb').read())
Output (with magic line or buffering=0 or f.flush() after f.write)
b'1234a67b8ca'
b'1234x67y8zx'
Output (without magic line)
b'1234a67b8ca'
b'1234a67b8zx1234x67y8'
Answer:
If your mapping is 1 -> 1 you could use bytes.translate():
#!/usr/bin/env python3
import io
import os
import sys

filename = '/tmp/a'
data = b'1234a67b8ca'*10000
with open(filename, 'wb') as f:
    f.write(data)
assert data == open(filename, 'rb').read()
print(data[:10] + data[-10:])

bufsize = io.DEFAULT_BUFFER_SIZE
table = bytes.maketrans(b'abcde', b'xyzzz')  # mapping
with open(filename, 'r+b') as f:
    while True:
        b = f.read(bufsize)  # result shouldn't depend on bufsize due to 1 -> 1
        if not b:
            break
        f.seek(-len(b), os.SEEK_CUR)
        f.write(b.translate(table))
        f.flush()

tr_data = data.translate(table)
assert tr_data == open(filename, 'rb').read()
print(tr_data[:10] + tr_data[-10:])
It seems that io.BufferedRandom can't do interleaved read/seek/write correctly (a bug in Python 3) without the flush().
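Applying the same flush() workaround to the array-based loop from the question would look roughly like the sketch below. It is untested against the original data and uses hypothetical stand-ins (path, row_len, n_rows, mapping) for self._path, self._size and mappingDict; the only real change is the flush() after each write, which replaces the magic seek.
import array
import os

# hypothetical stand-ins for the question's attributes
path = 'data.bin'
row_len, n_rows = 16, 64                           # self._size[0], self._size[1]*self._size[2]
mapping = {b: (b + 1) % 256 for b in range(256)}   # mappingDict

# create a throwaway test file of the right size
with open(path, 'wb') as f:
    f.write(os.urandom(row_len * n_rows))

with open(path, 'r+b') as source:
    for row in range(n_rows):
        source.seek(row * row_len, 0)
        buf = array.array('B')
        buf.fromfile(source, row_len)
        new_line = array.array('B', [mapping[b] for b in buf])
        source.seek(row * row_len, 0)
        new_line.tofile(source)
        source.flush()   # flush the buffered writer before the next read/seek

assert os.path.getsize(path) == row_len * n_rows   # no extra bytes appended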

Having experimented with this a little, I conjecture that it is a bug in Python 3.
In support of my conjecture, I offer the following code (based on J.F. Sebastian's):
import os
import sys

filename = '/tmp/a'
with open(filename, 'wb') as f:
    f.write(b'1234a67b8ca')
print(open(filename, 'rb').read())

bufsize = 3
with open(filename, 'r+b') as f:
    for i in range(0, os.path.getsize(filename), bufsize):
        f.seek(i, os.SEEK_SET)
        b = f.read(bufsize)
        f.seek(i, os.SEEK_SET)
        f.write(b)
    # f.seek(0, os.SEEK_END)  # magic
print(open(filename, 'rb').read())
When run using Python 2.7.1, it works as you'd expect, and the magic line makes no difference.
When run using Python 3.1.2, it inexplicably requires the magic no-op seek() in order to make it work as expected.
At this point I'd suggest demonstrating the code to core Python 3 developers to get their opinion on whether this is indeed a bug.

Reading first lines of bz2 files in python

I am trying to extract the first 10,000 lines from a bz2 file.
import bz2

file = "file.bz2"
file_10000 = "file.txt"

output_file = codecs.open(file_10000, 'w+', 'utf-8')
source_file = bz2.open(file, "r")

count = 0
for line in source_file:
    count += 1
    if count < 10000:
        output_file.writerow(line)
But I get an error "'module' object has no attribute 'open'". Do you have any ideas? Or maybe I could save the first 10,000 lines to a txt file in some other way? I am on Windows.
Here is a fully working example that includes writing and reading a test file that is much smaller than your 10000 lines. It's nice to have working examples in questions so we can test easily.
import bz2
import itertools
import codecs

file = "file.bz2"
file_10000 = "file.txt"

# write test file with 9 lines
with bz2.BZ2File(file, "w") as fp:
    fp.write('\n'.join('123456789'))

# the original script using BZ2File ... and 3 lines for test
# ...and fixing bugs:
#   1) it only writes 9999 instead of 10000
#   2) files don't do writerow
#   3) close the files
output_file = codecs.open(file_10000, 'w+', 'utf-8')
source_file = bz2.BZ2File(file, "r")
count = 0
for line in source_file:
    count += 1
    if count <= 3:
        output_file.write(line)
source_file.close()
output_file.close()

# show what you got
print('---- Test 1 ----')
print(repr(open(file_10000).read()))
A more efficient way to do it is to break out of the for loop after reading the lines you want. You can even leverage iterators to thin out the code, like so:
# a faster way to read first 3 lines
with bz2.BZ2File(file) as source_file, \
        codecs.open(file_10000, 'w+', 'utf-8') as output_file:
    output_file.writelines(itertools.islice(source_file, 3))

# show what you got
print('---- Test 2 ----')
print(repr(open(file_10000).read()))
This is definitely a simpler way of doing it than the other answer, and it works in both Python 2 and 3. Also, it will short-circuit if you don't have >= 10,000 lines.
from bz2 import BZ2File as bzopen

# writing to a file
with bzopen("file.bz2", "w") as bzfout:
    for i in range(123456):
        bzfout.write(b"%i\n" % i)

# reading a bz2 archive
with bzopen("file.bz2", "r") as bzfin:
    """ Handle lines here """
    lines = []
    for i, line in enumerate(bzfin):
        if i == 10000: break
        lines.append(line.rstrip())
print(lines)
Just another variation.
import bz2

myfile = 'c:\\my_dir\\random.txt.bz2'
newfile = 'c:\\my_dir\\random_10000.txt'

stream = bz2.BZ2File(myfile)
with open(newfile, 'w') as f:
    for i in range(1, 10000):
        f.write(stream.readline())
This worked for me:
sudo apt-get install python-dev
sudo pip install backports.lzma

writing data into file with binary packed format in python

I am reading some values from a file and want to write the modified values back into the file. My file is in .ktx format [binary packed format].
I am using struct.pack(), but it seems that something is going wrong with that:
bytes = file.read(4)
bytesAsInt = struct.unpack("l",bytes)
number=1+(bytesAsInt[0])
number=hex(number)
no=struct.pack("1",number)
outfile.write(no)
I want to write it both ways: little-endian and big-endian.
no_little = struct.pack("<l", bytesAsInt[0])  # '<' = little-endian (the native order on x86)
no_big = struct.pack(">l", bytesAsInt[0])     # '>' = big-endian
again you can check the docs and see the format characters you need
https://docs.python.org/3/library/struct.html
>>> struct.unpack("l","\x05\x04\x03\03")
(50529285,)
>>> struct.pack("l",50529285)
'\x05\x04\x03\x03'
>>> struct.pack("<l",50529285)
'\x05\x04\x03\x03'
>>> struct.pack(">l",50529285)
'\x03\x03\x04\x05'
also note that it is a lowercase L , not a one (as also covered in the docs)
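For a concrete, minimal sketch of the whole read-increment-write round trip with an explicit byte order (using a hypothetical stand-in file rather than a real .ktx, and assuming the field is a 4-byte unsigned integer, which you would need to confirm against the KTX spec):
import struct

path = "texture.bin"   # hypothetical test file standing in for the .ktx file

# create a small test file containing one 4-byte value
with open(path, "wb") as f:
    f.write(struct.pack("<I", 41))

# read the value, increment it, and write it back in place
with open(path, "r+b") as f:
    (value,) = struct.unpack("<I", f.read(4))  # "<I" = little-endian unsigned 32-bit
    f.seek(0)
    f.write(struct.pack("<I", value + 1))      # use ">I" instead for big-endian

with open(path, "rb") as f:
    assert struct.unpack("<I", f.read(4)) == (42,)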
I haven't tested this but the following function should solve your problem. At the moment it reads the file contents completely, creates a buffer and then writes out the updated contents. You could also modify the file buffer directly using unpack_from and pack_into but it might be slower (again, not tested). I'm using the struct.Struct class since you seem to want to unpack the same number many times.
import os
import struct
from StringIO import StringIO

def modify_values(in_file, out_file, increment=1, num_code="i", endian="<"):
    with open(in_file, "rb") as file_h:
        content = file_h.read()
    num = struct.Struct(endian + num_code)
    buf = StringIO()
    try:
        while len(content) >= num.size:
            value = num.unpack(content[:num.size])[0]
            value += increment
            buf.write(num.pack(value))
            content = content[num.size:]
    except Exception as err:
        raise  # handle the error as appropriate
    else:
        buf.seek(0)
        with open(out_file, "wb") as file_h:
            file_h.write(buf.read())
An alternative is to use the array module, which makes it quite easy. I don't know how to implement endianness with an array, though (but see the byteswap() sketch after the code below).
from array import array

def modify_values(filename, increment=1, num_code="i"):
    with open(filename, "rb") as file_h:
        arr = array(num_code, file_h.read())
    for i in range(len(arr)):
        arr[i] += increment
    with open(filename, "wb") as file_h:
        arr.tofile(file_h)
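That said, array.array does have a byteswap() method, so a rough sketch of handling endianness with an array (assuming you know the file's byte order and compare it against the machine's native order from sys.byteorder) could be:
import sys
from array import array

def modify_values_endian(filename, increment=1, num_code="i", file_byteorder="little"):
    with open(filename, "rb") as file_h:
        arr = array(num_code, file_h.read())
    if file_byteorder != sys.byteorder:
        arr.byteswap()              # convert file order -> native order
    for i in range(len(arr)):
        arr[i] += increment
    if file_byteorder != sys.byteorder:
        arr.byteswap()              # convert back before writing
    with open(filename, "wb") as file_h:
        arr.tofile(file_h)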

read whole file at once

I need to read the whole source data from a file, something.zip (not uncompress it).
I tried
f = open('file.zip')
s = f.read()
f.close()
return s
but it returns only a few bytes and not the whole source data. Any idea how to achieve it? Thanks
Use binary mode ('b') when you're dealing with a binary file:
def read_zipfile(path):
    with open(path, 'rb') as f:
        return f.read()
By the way, use a with statement instead of closing the file manually.
As mentioned, there is an EOF character (0x1A, on Windows in text mode) that terminates the .read() operation. To reproduce and demonstrate this:
# Create file of 256 bytes
with open('testfile', 'wb') as fout:
    fout.write(''.join(map(chr, range(256))))

# Text mode
with open('testfile') as fin:
    print 'Opened in text mode is:', len(fin.read())
# Opened in text mode is: 26

# Binary mode - note 'rb'
with open('testfile', 'rb') as fin:
    print 'Opened in binary mode is:', len(fin.read())
# Opened in binary mode is: 256
This should do it:
In [1]: f = open('/usr/bin/ping', 'rb')
In [2]: bytes = f.read()
In [3]: len(bytes)
Out[3]: 9728
For comparison, here's the file I opened in the code above:
-rwx------+ 1 xx yy 9.5K Jan 19 2005 /usr/bin/ping*
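As a side note, on Python 3.5+ the same whole-file read can be written with pathlib (a minimal sketch; it assumes the file fits comfortably in memory):
from pathlib import Path

data = Path('file.zip').read_bytes()  # reads the raw, still-compressed bytes
print(len(data))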

How can I read a single character at a time from a file in Python?

In Python, given the name of a file, how can I write a loop that reads one character each time through the loop?
with open(filename) as f:
    while True:
        c = f.read(1)
        if not c:
            print("End of file")
            break
        print("Read a character:", c)
First, open a file:
with open("filename") as fileobj:
for line in fileobj:
for ch in line:
print(ch)
This goes through every line in the file and then every character in that line.
I like the accepted answer: it is straightforward and will get the job done. I would also like to offer an alternative implementation:
def chunks(filename, buffer_size=4096):
    """Reads `filename` in chunks of `buffer_size` bytes and yields each chunk
    until no more characters can be read; the last chunk will most likely have
    less than `buffer_size` bytes.

    :param str filename: Path to the file
    :param int buffer_size: Buffer size, in bytes (default is 4096)

    :return: Yields chunks of `buffer_size` size until exhausting the file
    :rtype: str
    """
    with open(filename, "rb") as fp:
        chunk = fp.read(buffer_size)
        while chunk:
            yield chunk
            chunk = fp.read(buffer_size)


def chars(filename, buffersize=4096):
    """Yields the contents of file `filename` character-by-character. Warning:
    will only work for encodings where one character is encoded as one byte.

    :param str filename: Path to the file
    :param int buffer_size: Buffer size for the underlying chunks,
        in bytes (default is 4096)

    :return: Yields the contents of `filename` character-by-character.
    :rtype: char
    """
    for chunk in chunks(filename, buffersize):
        for char in chunk:
            yield char


def main(buffersize, filenames):
    """Reads several files character by character and redirects their contents
    to `/dev/null`.
    """
    for filename in filenames:
        with open("/dev/null", "wb") as fp:
            for char in chars(filename, buffersize):
                fp.write(char)


if __name__ == "__main__":
    # Try reading several files varying the buffer size
    import sys
    buffersize = int(sys.argv[1])
    filenames = sys.argv[2:]
    sys.exit(main(buffersize, filenames))
The code I suggest is essentially the same idea as your accepted answer: read a given number of bytes from the file. The difference is that it first reads a good chunk of data (4096 is a good default for x86, but you may want to try 1024 or 8192; any multiple of your page size), and then yields the characters in that chunk one by one.
The code I present may be faster for larger files. Take, for example, the entire text of War and Peace, by Tolstoy. These are my timing results (MacBook Pro using OS X 10.7.4; so.py is the name I gave to the code I pasted):
$ time python so.py 1 2600.txt.utf-8
python so.py 1 2600.txt.utf-8 3.79s user 0.01s system 99% cpu 3.808 total
$ time python so.py 4096 2600.txt.utf-8
python so.py 4096 2600.txt.utf-8 1.31s user 0.01s system 99% cpu 1.318 total
Now: do not take the buffer size at 4096 as a universal truth; look at the results I get for different sizes (buffer size (bytes) vs wall time (sec)):
2 2.726
4 1.948
8 1.693
16 1.534
32 1.525
64 1.398
128 1.432
256 1.377
512 1.347
1024 1.442
2048 1.316
4096 1.318
As you can see, you can start seeing gains earlier on (and my timings are likely very inaccurate); the buffer size is a trade-off between performance and memory. The default of 4096 is just a reasonable choice but, as always, measure first.
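If you want to repeat the measurement on your own data, a rough sketch using time.perf_counter (Python 3.3+; it assumes the chars() generator defined above and a local text file, here hypothetically named 2600.txt.utf-8) is:
import time

def time_buffer_sizes(filename, sizes=(1, 64, 1024, 4096, 8192)):
    # crude timing loop: larger buffers usually win, but measure on your data
    for size in sizes:
        start = time.perf_counter()
        n = sum(1 for _ in chars(filename, size))
        elapsed = time.perf_counter() - start
        print("buffer=%6d  chars=%d  %.3fs" % (size, n, elapsed))

time_buffer_sizes("2600.txt.utf-8")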
Just:
myfile = open(filename)
onecharacter = myfile.read(1)
Python itself can help you with this, in interactive mode:
>>> help(file.read)
Help on method_descriptor:

read(...)
    read([size]) -> read at most size bytes, returned as a string.

    If the size argument is negative or omitted, read until EOF is reached.
    Notice that when in non-blocking mode, less data than what was requested
    may be returned, even if no size parameter was given.
I learned a new idiom for this today while watching Raymond Hettinger's Transforming Code into Beautiful, Idiomatic Python:
import functools

with open(filename) as f:
    f_read_ch = functools.partial(f.read, 1)
    for ch in iter(f_read_ch, ''):
        print 'Read a character:', repr(ch)
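The same iter(callable, sentinel) idiom in Python 3 syntax would look roughly like this (a small sketch with a hypothetical throwaway file):
import functools

filename = "example.txt"   # hypothetical throwaway file
with open(filename, "w") as f:
    f.write("abc")

with open(filename) as f:
    for ch in iter(functools.partial(f.read, 1), ''):
        print('Read a character:', repr(ch))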
Just read a single character
f.read(1)
This will also work:
with open("filename") as fileObj:
for line in fileObj:
for ch in line:
print(ch)
It goes through every line in the file and every character in every line.
(Note that this post now looks extremely similar to a highly upvoted answer, but this was not the case at the time of writing.)
Best answer for Python 3.8+:
with open(path, encoding="utf-8") as f:
    while c := f.read(1):
        do_my_thing(c)
You may want to specify utf-8 and avoid the platform encoding. I've chosen to do that here.
Function – Python 3.8+:
def stream_file_chars(path: str):
    with open(path) as f:
        while c := f.read(1):
            yield c
Function – Python<=3.7:
def stream_file_chars(path: str):
    with open(path, encoding="utf-8") as f:
        while True:
            c = f.read(1)
            if c == "":
                break
            yield c
Function – pathlib + documentation:
from pathlib import Path
from typing import Union, Generator

def stream_file_chars(path: Union[str, Path]) -> Generator[str, None, None]:
    """Streams characters from a file."""
    with Path(path).open(encoding="utf-8") as f:
        while (c := f.read(1)) != "":
            yield c
You should try f.read(1), which is definitely correct and the right thing to do.
f = open('hi.txt', 'w')
f.write('0123456789abcdef')
f.close()

f = open('hi.txt', 'r')
f.seek(12)
print f.read(1)  # This will read just "c"
As a supplement: if you are reading a file that contains a line that is very huge, which might exhaust your memory, you might consider reading it into a buffer and then yielding each character:
def read_char(inputfile, buffersize=10240):
    with open(inputfile, 'r') as f:
        while True:
            buf = f.read(buffersize)
            if not buf:
                break
            for char in buf:
                yield char
        yield ''  # handle the case where the file is empty

if __name__ == "__main__":
    for char in read_char('./very_large_file.txt'):
        process(char)
os.system("stty -icanon -echo")
while True:
raw_c = sys.stdin.buffer.peek()
c = sys.stdin.read(1)
print(f"Char: {c}")
Combining qualities of some other answers, here is something that is invulnerable to long files / lines, while being more succinct and faster:
import functools as ft, itertools as it

with open(path) as f:
    for c in it.chain.from_iterable(
        iter(ft.partial(f.read, 4096), '')
    ):
        print(c)
# reading out the file at once in a list and then printing one-by-one
f = open('file.txt')
for i in list(f.read()):
    print(i)
