Python Read Certain Number of Bytes After Character

I'm dealing with a character-separated hex file, where each field has a particular start code. I've opened the file as 'rb', but I was wondering: after I get the index of the start code using .find, how do I read a certain number of bytes from that position?
This is how I am loading the file and what I am attempting to do:
with open(someFile, 'rb') as fileData:
    startIndex = fileData.find('(G')
    data = fileData[startIndex:7]
where 7 is the number of bytes I want to read from the index returned by the find function. I am using Python 2.7.3.

You can get the position of a substring in a bytestring under Python 2.7 like this:
>>> with open('student.txt', 'rb') as f:
...     data = f.read()
...
>>> data # holds the French word for student: élève
'\xc3\xa9l\xc3\xa8ve\n'
>>> len(data) # this shows we are dealing with bytes here, because "élève\n" would be 6 characters long, had it been properly decoded!
8
>>> len(data.decode('utf-8'))
6
>>> data.find('\xa8') # continue with the bytestring...
4
>>> bytes_to_read = 3
>>> data[4:4+bytes_to_read]
'\xa8ve'
You can look for the special characters, and for compatibility with Python 3 it's better to prefix the literal with a b, indicating these are bytes (in Python 2.x it works without, too):
>>> data.find(b'è') # in python2.x this works too (unfortunately, because it has lead to a lot of confusion): data.find('è')
3
>>> bytes_to_read = 3
>>> pos = data.find(b'è')
>>> data[pos:pos+bytes_to_read] # when you use the syntax 'n:m', it will read bytes in a bytestring
'\xc3\xa8v'
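Putting it together for the original question, a minimal sketch (someFile and the 7-byte field width come from the question; note that .find is called on the bytes read from the file, not on the file object itself):
with open(someFile, 'rb') as f:
    data = f.read()            # the whole file as one bytestring
pos = data.find(b'(G')         # index of the start code, or -1 if absent
if pos != -1:
    field = data[pos:pos + 7]  # 7 bytes starting at the start code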

Related

How to decode longest sub-bytes into str?

Suppose I read a long bytes object from somewhere, knowing it is utf-8 encoded. But the read may not fully consume the available content so that the last character in the stream may be incomplete. Calling bytes.decode() on this object may result in a decode error. But what really fails is only the last few bytes. Is there a function that works in this case, returning the longest decoded string and the remaining bytes?
UTF-8 encodes a character into at most 4 bytes, so retrying the decode after trimming up to 3 trailing bytes should work, but most of that computation would be wasted, and I don't really like this solution.
To give a simple but concrete example:
>>> b0 = b'\xc3\x84\xc3\x96\xc3'
>>> b1 = b'\x9c\xc3\x84\xc3\x96\xc3\x9c'
>>> (b0 + b1).decode()
'ÄÖÜÄÖÜ'
(b0 + b1).decode() is fine, but b0.decode() will raise. The solution should be able to decode as much of b0 as possible and return the bytes that cannot be decoded.
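For reference, the naive trim-and-retry approach described above might look like this minimal sketch (the helper name is illustrative); the answers below avoid the wasted work:
def decode_longest_naive(b):
    """Return (text, undecoded_tail), trimming up to 3 trailing bytes."""
    for trim in range(4):
        head, tail = b[:len(b) - trim], b[len(b) - trim:]
        try:
            return head.decode('utf-8'), tail
        except UnicodeDecodeError:
            pass
    return b.decode('utf-8'), b''  # more than 3 bad trailing bytes: genuinely malformed, let it raise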
You are describing the basic usage of io.TextIOWrapper: a buffered text stream over a binary stream.
>>> import io
>>> txt = 'before\N{PILE OF POO}after'
>>> b = io.BytesIO(txt.encode('utf-8'))
>>> t = io.TextIOWrapper(b)
>>> t.read(5)
'befor'
>>> t.read(1)
'e'
>>> t.read(1)
'💩'
>>> t.read(1)
'a'
Contrast with reading a bytes stream directly, where it would be possible to read halfway through an encoded pile of poo:
>>> b.seek(0)
0
>>> b.read(5)
b'befor'
>>> b.read(1)
b'e'
>>> b.read(1)
b'\xf0'
>>> b.read(1)
b'\x9f'
>>> b.read(1)
b'\x92'
>>> b.read(1)
b'\xa9'
>>> b.read(1)
b'a'
Specify encoding="utf-8" if you want to be explicit. The default encoding, i.e. locale.getpreferredencoding(False), would usually be utf-8 anyway.
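If you want the "longest decoded prefix plus leftover bytes" shape directly rather than a stream, the stdlib's incremental decoders expose exactly that state; a minimal sketch, assuming UTF-8 (the helper name decode_longest is illustrative):
import codecs

def decode_longest(chunk):
    dec = codecs.getincrementaldecoder('utf-8')()
    text = dec.decode(chunk)       # final=False by default: a truncated tail is buffered, not an error
    remainder = dec.getstate()[0]  # the bytes still waiting inside the decoder
    return text, remainder

text, rest = decode_longest(b'\xc3\x84\xc3\x96\xc3')
print(text, rest)  # ÄÖ b'\xc3'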
As I mentioned in the comments under wim's answer, I think you could use the codecs.iterdecode() incremental decoder to do this. Since it's a generator function, there's no need to manually save and restore its state between iterative calls.
Here's how it might be used to handle a situation like the one you described:
import codecs
from random import randint

def reader(sequence):
    """ Yield random length chunks of sequence until exhausted. """
    plural = lambda word, n, ending='s': (word+ending) if n > 1 else word
    i = 0
    while i < len(sequence):
        size = randint(1, 4)
        chunk = sequence[i: i+size]
        hexrepr = '0x' + ''.join('%02X' % b for b in chunk)
        print('read {} {}: {}'.format(size, plural('byte', len(chunk)), hexrepr))
        yield chunk
        i += size

bytes_obj = b'\xc3\x84\xc3\x96\xc3\x9c\xc3\x84\xc3\x96\xc3\x9c'  # 'ÄÖÜÄÖÜ'

for decoded in codecs.iterdecode(reader(bytes_obj), 'utf-8'):
    print(decoded)
Sample output:
read 3 bytes: 0xC384C3
Ä
read 1 byte: 0x96
Ö
read 1 byte: 0xC3
read 3 bytes: 0x9CC384
ÜÄ
read 2 bytes: 0xC396
Ö
read 4 bytes: 0xC39C
Ü

How to change the bytes in a file?

I'm making an encryption program and I need to open a file in binary mode to access the non-ASCII and non-printable characters. I need to check whether a character from the file is a letter, a number, a symbol, or an unprintable character. That means I have to check, one by one, whether the bytes (decoded to ASCII) match any of these characters:
{^9,dzEV=Q4ciT+/s};fnq3BFh% #2!k7>YSU<GyD\I]|OC_e.W0M~ua-jR5lv1wA`#8t*xr'K"[P)&b:g$p(mX6Ho?JNZL
I think I could encode these characters above to binary and then compare them with bytes. I don't know how to do this.
P.S. Sorry for bad English and the binary misunderstanding. (I hope you know what I mean by bytes: characters in binary mode, like this):
\x01\x00\x9a\x9c\x18\x00
There are two major string types in Python: bytestrings (a sequence of bytes) that represent binary data, and Unicode strings (a sequence of Unicode codepoints) that represent human-readable text. It is simple to convert one into the other:
unicode_text = bytestring.decode(character_encoding)
bytestring = unicode_text.encode(character_encoding)
If you open a file in binary mode e.g., 'rb' then file.read() returns a bytestring (bytes type):
>>> b'A' == b'\x41' == chr(0b1000001).encode()
True
There are several methods that can be used to classify bytes (a combined sketch follows this list):
string methods such as bytes.isdigit():
>>> b'1'.isdigit()
True
string constants such as string.printable
>>> import string
>>> b'!' in string.printable.encode()
True
regular expressions such as \d
>>> import re
>>> bool(re.match(br'\d+$', b'123'))
True
classification functions in the curses.ascii module, e.g., curses.ascii.isprint()
>>> from curses import ascii
>>> bytearray(filter(ascii.isprint, b'123'))
bytearray(b'123')
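Putting a few of these together: a minimal sketch that splits a file's bytes into printable and unprintable groups, in the spirit of the question (the file name data.bin is a placeholder, and string.printable stands in for the question's exact character set):
import string

PRINTABLE = string.printable.encode()  # ASCII digits, letters, punctuation, whitespace

with open('data.bin', 'rb') as f:
    data = f.read()

# iterating over bytes yields ints in Python 3, and `in` tests membership
printable = bytes(b for b in data if b in PRINTABLE)
unprintable = bytes(b for b in data if b not in PRINTABLE)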
bytearray is a mutable sequence of bytes; unlike a bytestring, you can change it in place, e.g., to lowercase every 3rd byte that is uppercase:
>>> import string
>>> a = bytearray(b'ABCDEF_')
>>> uppercase = string.ascii_uppercase.encode()
>>> a[::3] = [b | 0b0100000 if b in uppercase else b
... for b in a[::3]]
>>> a
bytearray(b'aBCdEF_')
Notice: b'ad' are lowercase but b'_' remained the same.
To modify a binary file in place, you could use the mmap module, e.g., to lowercase the 4th column in every other line of 'file':
#!/usr/bin/env python3
import mmap
import string

uppercase = string.ascii_uppercase.encode()
ncolumn = 3  # select 4th column
with open('file', 'r+b') as file, \
     mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_WRITE) as mm:
    while True:
        mm.readline()  # ignore every other line
        pos = mm.tell()  # remember current position
        if not mm.readline():  # EOF
            break
        if mm[pos + ncolumn] in uppercase:
            mm[pos + ncolumn] |= 0b0100000  # lowercase
Note: Python 2 and 3 APIs differ in this case. The code uses Python 3.
Input
ABCDE1
FGHIJ
ABCDE
FGHI
Output
ABCDE1
FGHiJ
ABCDE
FGHi
Notice: the 4th column became lowercase on the 2nd and 4th lines.
Typically, if you want to change a file, you read from the file, write modifications to a temporary file, and on success you move the temporary file in place of the original:
#!/usr/bin/env python3
import os
import string
from tempfile import NamedTemporaryFile

caesar_shift = 3
filename = 'file'

def caesar_bytes(plaintext, shift, alphabet=string.ascii_lowercase.encode()):
    shifted_alphabet = alphabet[shift:] + alphabet[:shift]
    return plaintext.translate(plaintext.maketrans(alphabet, shifted_alphabet))

dest_dir = os.path.dirname(filename)
chunksize = 1 << 15
with open(filename, 'rb') as file, \
     NamedTemporaryFile('wb', dir=dest_dir, delete=False) as tmp_file:
    while True:  # encrypt
        chunk = file.read(chunksize)
        if not chunk:  # EOF
            break
        tmp_file.write(caesar_bytes(chunk, caesar_shift))
os.replace(tmp_file.name, filename)
Input
abc
def
ABC
DEF
Output
def
ghi
ABC
DEF
To convert the output back, set caesar_shift = -3.
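For instance, a quick round trip with the caesar_bytes function from the snippet above:
>>> caesar_bytes(b'abc', 3)
b'def'
>>> caesar_bytes(b'def', -3)
b'abc'
>>> caesar_bytes(b'ABC', 3)  # bytes outside the lowercase alphabet pass through unchanged
b'ABC'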
To open a file in binary mode you use the open("filena.me", "rb") command. I've never used the command personally, but that should get you the information you need.

Split array byte string in Python

I'm trying to split a string of bytes like this:
'\xf0\x9f\x98\x84 \xf0\x9f\x98\x83 \xf0\x9f\x98\x80 \xf0\x9f\x98\x8a \xe2\x98\xba \xf0\x9f\x98\x89 \xf0\x9f\x98\x8d \xf0\x9f\x98\x98 \xf0\x9f\x98\x9a \xf0\x9f\x98\x97 \xf0\x9f\x98\x99 \xf0\x9f\x98\x9c \xf0\x9f\x98\x9d \xf0\x9f\x98\x9b \xf0\x9f\x98\x81 \xf0\x9f\x98\x82 \xf0\x9f\x98\x85 \xf0\x9f\x98\x86 \xf0\x9f\x98\x8b \xf0\x9f\x98\x8e \xf0\x9f\x98\xac \xf0\x9f\x98\x87'
into something like this:
'\xf0\x9f\x98\x84', '\xf0\x9f\x98\x83', etc.
However, the split() method returns me something like this:
'xf0', 'x9f' 'x98' etc.
I tried split(" "), but it does not seem to work. How do I achieve the above?
str.split(' ') or even just str.split() (split on arbitrary-width whitespace) works just fine on your input:
sample = '\xf0\x9f\x98\x84 \xf0\x9f\x98\x83 \xf0\x9f\x98\x80 \xf0\x9f\x98\x8a \xe2\x98\xba \xf0\x9f\x98\x89 \xf0\x9f\x98\x8d \xf0\x9f\x98\x98 \xf0\x9f\x98\x9a \xf0\x9f\x98\x97 \xf0\x9f\x98\x99 \xf0\x9f\x98\x9c \xf0\x9f\x98\x9d \xf0\x9f\x98\x9b \xf0\x9f\x98\x81 \xf0\x9f\x98\x82 \xf0\x9f\x98\x85 \xf0\x9f\x98\x86 \xf0\x9f\x98\x8b \xf0\x9f\x98\x8e \xf0\x9f\x98\xac \xf0\x9f\x98\x87'
parts = sample.split()
Demo:
>>> sample = '\xf0\x9f\x98\x84 \xf0\x9f\x98\x83 \xf0\x9f\x98\x80 \xf0\x9f\x98\x8a \xe2\x98\xba \xf0\x9f\x98\x89 \xf0\x9f\x98\x8d \xf0\x9f\x98\x98 \xf0\x9f\x98\x9a \xf0\x9f\x98\x97 \xf0\x9f\x98\x99 \xf0\x9f\x98\x9c \xf0\x9f\x98\x9d \xf0\x9f\x98\x9b \xf0\x9f\x98\x81 \xf0\x9f\x98\x82 \xf0\x9f\x98\x85 \xf0\x9f\x98\x86 \xf0\x9f\x98\x8b \xf0\x9f\x98\x8e \xf0\x9f\x98\xac \xf0\x9f\x98\x87'
>>> sample.split()
['\xf0\x9f\x98\x84', '\xf0\x9f\x98\x83', '\xf0\x9f\x98\x80', '\xf0\x9f\x98\x8a', '\xe2\x98\xba', '\xf0\x9f\x98\x89', '\xf0\x9f\x98\x8d', '\xf0\x9f\x98\x98', '\xf0\x9f\x98\x9a', '\xf0\x9f\x98\x97', '\xf0\x9f\x98\x99', '\xf0\x9f\x98\x9c', '\xf0\x9f\x98\x9d', '\xf0\x9f\x98\x9b', '\xf0\x9f\x98\x81', '\xf0\x9f\x98\x82', '\xf0\x9f\x98\x85', '\xf0\x9f\x98\x86', '\xf0\x9f\x98\x8b', '\xf0\x9f\x98\x8e', '\xf0\x9f\x98\xac', '\xf0\x9f\x98\x87']
However, if this is binary data, you need to be careful that there are no \x20 bytes inside those 4-byte values. It might be better to just produce chunks of 5 bytes from this, then remove the last byte of each. Note, though, that '\xe2\x98\xba' in the sample is only 3 bytes long, so a fixed 5-byte stride stops lining up once that value is reached:
for i in range(0, len(sample), 5):
    chunk = sample[i:i + 4]  # ignore the 5th byte, a space
Demo:
>>> for i in range(0, len(sample), 5):
...     chunk = sample[i:i + 4]  # ignore the 5th byte, a space
...     print chunk.decode('utf8')
...     if i == 20: break
...
😄
😃
😀
😊
# On browsers that support it, those are various smiling emoji
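If the data is known to be valid UTF-8 (as this sample is), another option is to decode first and split the decoded text; in UTF-8 a \x20 byte can only ever be a literal space, never part of a multi-byte sequence, so the split is unambiguous regardless of how wide each value is. A minimal Python 2 sketch:
decoded_parts = sample.decode('utf-8').split()
byte_parts = [part.encode('utf-8') for part in decoded_parts]  # back to bytestrings if needed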

How to get bytes of image in StringIO container

I am getting an image from an email attachment that will never touch disk. The image will be placed into a StringIO container and processed by PIL. How do I get the file size in bytes?
image_file = StringIO('image from email')
im = Image.open(image_file)
Use StringIO's .tell() method by seeking to the end of the file:
>>> from StringIO import StringIO
>>> s = StringIO("foobar")
>>> s.tell()
0
>>> s.seek(0, 2)
>>> s.tell()
6
In your case:
image_file = StringIO('image from email')
image_file.seek(0, 2) # Seek to the end
bytes = image_file.tell() # Get no. of bytes
image_file.seek(0) # Seek to the start
im = Image.open(image_file)
Suppose you have:
>>> import os
>>> from io import StringIO
>>> s = StringIO("cat\u2014jack")
>>> s.seek(0, os.SEEK_END)
8
>>> print(s.tell())
8
This, however, is incorrect: \u2014 is an em dash, a single character, but it is 3 bytes long when encoded as UTF-8.
>>> len("\u2014")
1
>>> len("\u2014".encode("utf-8"))
3
StringIO does not necessarily store the text as UTF-8 internally either. CPython used to use UCS-2 or UCS-4; since PEP 393 it picks the narrowest of Latin-1, UCS-2, or UCS-4 that can represent the string. Either way, the offsets it reports are counted in characters, not in the bytes of any particular encoding.
What matters is the binary representation you ultimately go with. If you're encoding text and will eventually write the file out as UTF-8, you have to encode the value in its entirety to know how many bytes it will take up, because UTF-8 is a variable-length encoding: characters may require multiple bytes.
You could do something like:
>>> s = StringIO("cat\u2014jack")
>>> len(s.getvalue().encode('utf-8'))
10
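For actual image bytes, it may be simpler to hold the attachment in io.BytesIO in the first place and ask it directly; a minimal Python 3 sketch (the payload is a placeholder):
import io

buf = io.BytesIO(b'image bytes from email')
size = buf.getbuffer().nbytes  # byte count, no seeking required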

Is this a sensible approach for an EBCDIC (CP500) to Latin-1 converter?

I have to convert a number of large files (up to 2 GB each) from EBCDIC 500 encoding to Latin-1. Since I could only find EBCDIC-to-ASCII converters (dd, recode), and the files contain some additional proprietary character codes, I thought I'd write my own converter.
I have the character mapping so I'm interested in the technical aspects.
This is my approach so far:
import sys

# char mapping lookup table
EBCDIC_TO_LATIN1 = {
    0xC1: '41',  # A
    0xC2: '42',  # B
    # and so on...
}

BUFFER_SIZE = 1024 * 64

ebd_file = file(sys.argv[1], 'rb')
latin1_file = file(sys.argv[2], 'wb')

buffer = ebd_file.read(BUFFER_SIZE)
while buffer:
    latin1_file.write(ebd2latin1(buffer))
    buffer = ebd_file.read(BUFFER_SIZE)

ebd_file.close()
latin1_file.close()
This is the function that does the converting:
def ebd2latin1(ebcdic):
    result = []
    for ch in ebcdic:
        result.append(EBCDIC_TO_LATIN1[ord(ch)])
    return ''.join(result).decode('hex')
The question is whether or not this is a sensible approach from an engineering standpoint. Does it have some serious design issues? Is the buffer size OK? And so on...
As for the "proprietary characters" that some don't believe in: Each file contains a year's worth of patent documents in SGML format. The patent office has been using EBCDIC until they switched to Unicode in 2005. So there are thousands of documents within each file. They are separated by some hex values that are not part of any IBM specification. They were added by the patent office. Also, at the beginning of each file there are a few digits in ASCII that tell you about the length of the file. I don't really need that information but if I want to process the file so I have to deal with them.
Also:
$ recode IBM500/CR-LF..Latin1 file.ebc
recode: file.ebc failed: Ambiguous output in step `CR-LF..data'
Thanks for the help so far.
EBCDIC 500, aka Code Page 500, is among Python's encodings, although you link to cp1047, which isn't. Which one are you using, really? Anyway, this works for cp500 (or any other encoding that you have).
from __future__ import with_statement
import sys
from contextlib import nested

BUFFER_SIZE = 16384

with nested(open(sys.argv[1], 'rb'), open(sys.argv[2], 'wb')) as (infile, outfile):
    while True:
        buffer = infile.read(BUFFER_SIZE)
        if not buffer:
            break
        outfile.write(buffer.decode('cp500').encode('latin1'))
This way you shouldn't need to keep track of the mappings yourself.
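For what it's worth, the same idea under Python 3 is shorter still, since open() accepts an encoding directly; a minimal sketch (file names are placeholders):
with open('in.ebc', encoding='cp500') as infile, \
        open('out.txt', 'w', encoding='latin-1') as outfile:
    for chunk in iter(lambda: infile.read(16384), ''):  # '' signals EOF
        outfile.write(chunk)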
If you set up the table correctly, then you just need to do:
translated_chars = ebcdic.translate(EBCDIC_TO_LATIN1)
where ebcdic contains EBCDIC characters and EBCDIC_TO_LATIN1 is a 256-char string which maps each EBCDIC character to its Latin-1 equivalent. The characters in EBCDIC_TO_LATIN1 are the actual binary values rather than their hex representations. For example, if you are using code page 500, the first 16 bytes of EBCDIC_TO_LATIN1 would be
'\x00\x01\x02\x03\x37\x2D\x2E\x2F\x16\x05\x25\x0B\x0C\x0D\x0E\x0F'
using a published cp500 mapping table as the reference.
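If you'd rather not type the 256 entries by hand, the table can also be generated from Python's own cp500 codec; a minimal Python 2 sketch (the 'replace' error handler is a defensive assumption for any byte without a Latin-1 equivalent):
EBCDIC_TO_LATIN1 = ''.join(
    chr(i).decode('cp500').encode('latin1', 'replace') for i in range(256))

translated_chars = ebcdic.translate(EBCDIC_TO_LATIN1)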
While this might not help the original poster anymore, some time ago I released a package for Python 2.6+ and 3.2+ that adds most of the western 8-bit mainframe codecs, including CP1047 (French) and CP1141 (German): https://pypi.python.org/pypi/ebcdic. Simply import ebcdic to add the codecs and then use open(..., encoding='cp1047') to read or write files.
Answer 1:
Yet another silly question: What gave you the impression that recode produced only ASCII as output? AFAICT it will transcode ANY of its repertoire of charsets to ANY of its repertoire, AND its repertoire includes IBM cp500 and cp1047, and OF COURSE latin1. Reading the comments, you will note that Lennaert and I have discovered that there aren't any "proprietary" codes in those two IBM character sets. So you may well be able to use recode after all, once you are certain what charset you've actually got.
Answer 2:
If you really need/want to transcode IBM cp1047 via Python, you might like to first get the mapping from an authoritative source, processing it via script with some checks:
URL = "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm"
"""
Sample lines:
<U0000>  \x00 |0
<U0001>  \x01 |0
<U0002>  \x02 |0
<U0003>  \x03 |0
<U0004>  \x37 |0
<U0005>  \x2D |0
"""
import urllib, re

text = urllib.urlopen(URL).read()
regex = r"<U([0-9a-fA-F]{4,4})>\s+\\x([0-9a-fA-F]{2,2})\s"
results = re.findall(regex, text)
wlist = [None] * 256
for result in results:
    unum, inum = [int(x, 16) for x in result]
    assert wlist[inum] is None
    assert 0 <= unum <= 255
    wlist[inum] = chr(unum)
assert not any(x is None for x in wlist)
print repr(''.join(wlist))
Then carefully copy/paste the output into your transcoding script for use with Vinay's buffer.translate(the_mapping) idea, with a buffer size perhaps a bit larger than 16KB and certainly a bit smaller than 2GB :-)
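A minimal sketch of that combination (Python 2.7; the file names and the 1 MiB buffer are placeholder choices):
TABLE = ''.join(wlist)   # the 256-char mapping printed by the script above
BUFFER_SIZE = 1 << 20    # 1 MiB: comfortably above 16KB, far below 2GB
with open('in.ebc', 'rb') as src, open('out.txt', 'wb') as dst:
    while True:
        buf = src.read(BUFFER_SIZE)
        if not buf:
            break
        dst.write(buf.translate(TABLE))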
No crystal ball, no info from OP, so had a bit of a rummage in the EPO website. Found freely downloadable weekly patent info files, still available in cp500/SGML even though website says this to be replaced by utf8/XML in 2006 :-). Got the 2009 week 27 file. Is a zip containing 2 files s350927[ab].bin. "bin" means "not XML". Got the spec! Looks possible that "proprietary codes" are actually BINARY fields. Each record has a fixed 252-byte header. First 5 bytes are record length in EBCDIC e.g. hex F0F2F2F0F8 -> 2208 bytes. Last 2 bytes of the fixed header are the BINARY length (redundant) of the following variable part. In the middle are several text fields, two 2-byte binary fields, and one 4-byte binary field. The binary fields are serial numbers within groups, but all I saw are 1. The variable part is SGML.
Example (last record from s350927b.bin):
Record number: 7266
pprint of header text and binary slices:
['EPB102055619 TXT00000001',
1,
' 20090701200927 08013627.8 EP20090528NN ',
1,
1,
' T *lots of spaces snipped*']
Edited version of the rather long SGML:
<PATDOC FILE="08013627.8" CY=EP DNUM=2055619 KIND=B1 DATE=20090701 STATUS=N>
*snip*
<B541>DE<B542>Windschutzeinheit für ein Motorrad
<B541>EN<B542>Windshield unit for saddle-ride type vehicle
<B541>FR<B542>Unité pare-brise pour motocyclette</B540>
*snip*
</PATDOC>
There are no header or trailer records, just this one record format.
So: if the OP's annual files are anything like this, we might be able to help him out.
Update: Above was the "2 a.m. in my timezone" version. Here's a bit more info:
OP said: "at the beginning of each file there are a few digits in ASCII that tell you about the length of the file." ... translate that to "at the beginning of each record there are five digits in EBCDIC that tell you exactly the length of the record" and we have a (very fuzzy) match!
Here is the URL of the documentation page: http://docs.epoline.org/ebd/info.htm
The FIRST file mentioned is the spec.
Here is the URL of the download-weekly-data page: http://ebd2.epoline.org/jsp/ebdst35.jsp
An observation: The data that I looked at is in the ST.35 series. There is also available for download ST.32 which appears to be a parallel version containing only the SGML content (in "reduced cp437/850", one tag per line). This indicates that the fields in the fixed-length header of the ST.35 records may not be very interesting, and can thus be skipped over, which would greatly simplify the transcoding task.
For what it's worth, here is my (investigatory, written after midnight) code:
[Update 2: tidied up the code a little; no functionality changes]
from pprint import pprint as pp
import sys
from struct import unpack

HDRSZ = 252
T = '>s'  # text
H = '>H'  # binary 2 bytes
I = '>I'  # binary 4 bytes
hdr_defn = [
    6, T,
    38, H,
    40, T,
    94, I,
    98, H,
    100, T,
    251, H,  # length of following SGML text
    HDRSZ + 1,
    ]
# above positions as per spec, reduce to allow for counting from 1
for i in xrange(0, len(hdr_defn), 2):
    hdr_defn[i] -= 1

def records(fname, output_encoding='latin1', debug=False):
    xlator = ''.join(chr(i).decode('cp500').encode(output_encoding, 'replace') for i in range(256))
    # print repr(xlator)
    def xlate(ebcdic):
        return ebcdic.translate(xlator)
        # return ebcdic.decode('cp500')  # use this if unicode output desired
    f = open(fname, 'rb')
    recnum = -1
    while True:
        # get header
        buff = f.read(HDRSZ)
        if not buff:
            return  # EOF
        recnum += 1
        if debug: print "\nrecnum", recnum
        assert len(buff) == HDRSZ
        recsz = int(xlate(buff[:5]))
        if debug: print "recsz", recsz
        # split remainder of header into text and binary pieces
        fields = []
        for i in xrange(0, len(hdr_defn) - 2, 2):
            ty = hdr_defn[i + 1]
            piece = buff[hdr_defn[i]:hdr_defn[i+2]]
            if ty == T:
                fields.append(xlate(piece))
            else:
                fields.append(unpack(ty, piece)[0])
        if debug: pp(fields)
        sgmlsz = fields.pop()
        if debug: print "sgmlsz: %d; expected: %d - %d = %d" % (sgmlsz, recsz, HDRSZ, recsz - HDRSZ)
        assert sgmlsz == recsz - HDRSZ
        # get sgml part
        sgml = f.read(sgmlsz)
        assert len(sgml) == sgmlsz
        sgml = xlate(sgml)
        if debug: print "sgml", sgml
        yield recnum, fields, sgml

if __name__ == "__main__":
    maxrecs = int(sys.argv[1])  # dumping out the last `maxrecs` records in the file
    fname = sys.argv[2]
    keep = [None] * maxrecs
    for recnum, fields, sgml in records(fname):
        # do something useful here
        keep[recnum % maxrecs] = (recnum, fields, sgml)
    keep.sort()
    for k in keep:
        if k:
            recnum, fields, sgml = k
            print
            print recnum
            pp(fields)
            print sgml
Assuming cp500 contains all of your "additional proprietary characters", a more concise version based on Lennart's answer, using the codecs module:
import sys, codecs

BUFFER_SIZE = 64*1024

ebd_file = codecs.open(sys.argv[1], 'r', 'cp500')
latin1_file = codecs.open(sys.argv[2], 'w', 'latin1')

buffer = ebd_file.read(BUFFER_SIZE)
while buffer:
    latin1_file.write(buffer)
    buffer = ebd_file.read(BUFFER_SIZE)

ebd_file.close()
latin1_file.close()
