I have three text values that I am encrypting and then writing to a file. Later I want to read the values back (in another script) and decrypt them.
I've successfully encrypted the values:
cenc = rsa.encrypt(client_name.encode('utf8'), publicKey)
denc = rsa.encrypt(expiry_date.encode('utf8'), publicKey)
fenc = rsa.encrypt(features.encode('utf8'), publicKey)
and written to a binary file:
licensefh = open("license.sfb", "wb")
licensefh.write(cenc)
licensefh.write(denc)
licensefh.write(fenc)
licensefh.close()
The three values cenc, denc and fenc are all of different lengths so when I read the file back:
licensefh = open("license.sfb", "rb")
encMessage = licensefh.read()
encMessage contains the entire file and I don't know how to get the three values back again.
I've tried using a separator between the values:
SEP = bytes(chr(0x02).encode('utf8'))
...
licensefh.write(cenc)
licensefh.write(SEP)
...
and then using encMessage.partition(SEP) or encMessage.split(SEP) but the data invariably contains the SEP value in it somewhere (I've tried a few different characters) so that didn't work.
I tried getting the length of the bytes objects cenc, denc and fenc, but this returned 256 for each value even though the contents of the variables are all different lengths.
My question is this. How do I write these three variable length values to a binary file and then separate them when I read them back again?
Here's an example of the 3 binary values:
b'tX\x10Fo\x89\x10~\x83Pok\xd1\xfb\xbe\x0e<a\xe5\x11md:\xe6\x84#\xfa\xf8\xe5\xeb\xf8\xdc{\xc0Z\xa0\xc0^\xc1\xd9\x820\xec\xec\xb0R\x99/\xa2l\x88\xa9\xa6g\xa3\x01m\xf9\x7f\x91\xb9\xe1\x80\xccs|\xb7_\xa9Fp\x11yvG\xdc\x02d\x8aK2\x92t\x0e\x1f\xca\x19\xbb&\xaf{\xc0y>\t|\x86\xab\x16.\xa5kZ"\xab6\xaaV\xf4w\x7f\xc5q\x07\xef\xa9\xa5\xa3\xf3 6\xdb\x03\x19S\xbd\x81\xf9\xc8\xc5\x90\x1e\x19\x86\xa4q\xe3?i\xc4\xac\t\xd5=3C\x9b#\xc3IuAN,\xeat\xc6\x96VFL\x1eFWZ\xa4\xd73\x92P#\x1d\xb9\x12\x15\xc9\xd4~\x8aWm^\xb8\x8b\x9d\x88\n)\xeb#\xe3\x93\xb1\\\xd6^\xe0\xce\xa2(\x05\xf5\xe6\x8b\xd1\x15\xd8v\xf0\xae\x90\xd8?\x01\r\x00\xf4\xa5\xadM|%\x98\xa9SR\xc6\xd0K\x9e&\xc3\xe0M\x81\x87\xdea\xcc\xd5\x9c\xcd\xfd1l\x1f\xb9?\xed\xd1\x95\xbc\x11\x85U9'
b'l\xd3S\xcc\x03\x9a\xf2\xfdr\xca\xbbA\x06\xfb\xd8\xbbWi\xdc\xb1\xf6&\x97T\x81Kl\r\x86\x9b\x95?\x94}\x8a\xd3\xa1V\x81\xd3]*B\x1f\x96`\xa3\xd1\xf2|B\x84?\xa0\ns\xb7\xcf\x18Y\x87\xcfR\x87!\x14\x81!\xf7\xf2\xe5x|=O\xe3\xba2\xf2!\x93\x0fT7\x0c~4\xa3\xe5\xb7\xf9wy\xb5\x12FM\x96\xd9\xfd\xedn\x9c\xacw\x1b\xc2\x17+\xb6\x05`\x10\xf8\xe4\x01\xde\xc7\xa2\xa0\x80\xd8\x15\xb1+<s\xc7\x19\x9c\x14\xb0\x1a"\x10\xbb\x0f\xe1\x05\x93\xd2?xX\xd9\x93\x8an\x8d\xcd\xbd!c\xd0,\xa45\xbai\xe3\xccx\x08\xaa,\xd1\xe5\'t\x91\xb8\xf2n$\x0c\xf9-\xb4\xc2\x07\x81\xe1\xe7\x8e\xb3\x98\x11\xf3\xa6\xd9wz\x9a3\xc9\x9c?z\xd8\xaa\x08}\xa2\x9c[\xf2\x9d\xe4\xcdb\xddl\xceV\x7f\xf1\x81\xb3\x88\x1e\x9c5?k\x0f\xc9\x86\x86&\xedV.\xa7\x8d\x13&V\xad\xca\xe5\x93\xfe\xa5\x94\xbc\xf5\xd1{Cl\xc0\x030\x92\x03\xc9'
b'#\xbdd7\xe9\xa0{\t\xb9\x87B\x9e\xf9\x97P^\xf3V\xb6\x93\x1f(J\x0b\xa3\xbf\xd8\x04\x86T\xa4\xca\xf3\xe8%\xddC\x11\xdb5\xff,\xf7\x13\xd7\xd2\xbc\xf3\x893\x83\xdcmJ\xc8p\xdf\x07V\x7fb\xeb\xa9\x8b\x0f\xca\xf9\x05\xfc\xdfS\x94b\x90\xcd\xfcn?/]\x11\xaf\xe606\xfb\\U59\xa0>\xbd\xd8\x1c\xa8\xca\x83\xf4C\x95v7\xc6\xe00\xe4,d_/\x83\xa0\xb9mO\x0e\xc4\x97J\x15\xf0\xca-\xa0\xafT\xe4\x82\x03\n\x14:\xa1\xdcL\x98\x9d,1\xfa\x10\xf4\xfd\xa0\x0b\xc7\x13!\xf7\xdb/\xda\x1a\x9df\x1cQ\xc0\x99H\x08\xa0c\x8f9/4\xc4\x05\xc6\x9eM\x8e\xe5V\xf8D\xc3\xfd\xad4\x94A\xb9[\x80\xb9\xcf\xe6\xd9\xb3M2\xd9N\xfbA\x18\x84/W\x9b\x92\xfe\xbb\xd6C\x85\xa3\xc6\xd2T\xd0\xb2\xb9\xf7R\xb4(s\xda\xbcX,9w\x17\x1c\xfb|\xa0\x87\xba\xca6>y\xba\\L4wc\x94\xe7$Y\x89\x07\x9b\xfe\x9b?{\x85'
#pippo1980's comment is how I would do it, using struct. (Incidentally, len() returning 256 for each value is expected: an RSA ciphertext is always exactly as long as the key modulus, so a 2048-bit key yields 256 bytes regardless of the plaintext length.)
import struct
cenc = b'tX\x10Fo\x89\x10~\x83Pok\xd1\xfb\xbe\x0e<a\xe5\x11md:\xe6\x84#\xfa\xf8\xe5\xeb\xf8\xdc{\xc0Z\xa0\xc0^\xc1\xd9\x820\xec\xec\xb0R\x99/\xa2l\x88\xa9\xa6g\xa3\x01m\xf9\x7f\x91\xb9\xe1\x80\xccs|\xb7_\xa9Fp\x11yvG\xdc\x02d\x8aK2\x92t\x0e\x1f\xca\x19\xbb&\xaf{\xc0y>\t|\x86\xab\x16.\xa5kZ"\xab6\xaaV\xf4w\x7f\xc5q\x07\xef\xa9\xa5\xa3\xf3 6\xdb\x03\x19S\xbd\x81\xf9\xc8\xc5\x90\x1e\x19\x86\xa4q\xe3?i\xc4\xac\t\xd5=3C\x9b#\xc3IuAN,\xeat\xc6\x96VFL\x1eFWZ\xa4\xd73\x92P#\x1d\xb9\x12\x15\xc9\xd4~\x8aWm^\xb8\x8b\x9d\x88\n)\xeb#\xe3\x93\xb1\\\xd6^\xe0\xce\xa2(\x05\xf5\xe6\x8b\xd1\x15\xd8v\xf0\xae\x90\xd8?\x01\r\x00\xf4\xa5\xadM|%\x98\xa9SR\xc6\xd0K\x9e&\xc3\xe0M\x81\x87\xdea\xcc\xd5\x9c\xcd\xfd1l\x1f\xb9?\xed\xd1\x95\xbc\x11\x85U9'
denc = b'l\xd3S\xcc\x03\x9a\xf2\xfdr\xca\xbbA\x06\xfb\xd8\xbbWi\xdc\xb1\xf6&\x97T\x81Kl\r\x86\x9b\x95?\x94}\x8a\xd3\xa1V\x81\xd3]*B\x1f\x96`\xa3\xd1\xf2|B\x84?\xa0\ns\xb7\xcf\x18Y\x87\xcfR\x87!\x14\x81!\xf7\xf2\xe5x|=O\xe3\xba2\xf2!\x93\x0fT7\x0c~4\xa3\xe5\xb7\xf9wy\xb5\x12FM\x96\xd9\xfd\xedn\x9c\xacw\x1b\xc2\x17+\xb6\x05`\x10\xf8\xe4\x01\xde\xc7\xa2\xa0\x80\xd8\x15\xb1+<s\xc7\x19\x9c\x14\xb0\x1a"\x10\xbb\x0f\xe1\x05\x93\xd2?xX\xd9\x93\x8an\x8d\xcd\xbd!c\xd0,\xa45\xbai\xe3\xccx\x08\xaa,\xd1\xe5\'t\x91\xb8\xf2n$\x0c\xf9-\xb4\xc2\x07\x81\xe1\xe7\x8e\xb3\x98\x11\xf3\xa6\xd9wz\x9a3\xc9\x9c?z\xd8\xaa\x08}\xa2\x9c[\xf2\x9d\xe4\xcdb\xddl\xceV\x7f\xf1\x81\xb3\x88\x1e\x9c5?k\x0f\xc9\x86\x86&\xedV.\xa7\x8d\x13&V\xad\xca\xe5\x93\xfe\xa5\x94\xbc\xf5\xd1{Cl\xc0\x030\x92\x03\xc9'
fenc = b'#\xbdd7\xe9\xa0{\t\xb9\x87B\x9e\xf9\x97P^\xf3V\xb6\x93\x1f(J\x0b\xa3\xbf\xd8\x04\x86T\xa4\xca\xf3\xe8%\xddC\x11\xdb5\xff,\xf7\x13\xd7\xd2\xbc\xf3\x893\x83\xdcmJ\xc8p\xdf\x07V\x7fb\xeb\xa9\x8b\x0f\xca\xf9\x05\xfc\xdfS\x94b\x90\xcd\xfcn?/]\x11\xaf\xe606\xfb\\U59\xa0>\xbd\xd8\x1c\xa8\xca\x83\xf4C\x95v7\xc6\xe00\xe4,d_/\x83\xa0\xb9mO\x0e\xc4\x97J\x15\xf0\xca-\xa0\xafT\xe4\x82\x03\n\x14:\xa1\xdcL\x98\x9d,1\xfa\x10\xf4\xfd\xa0\x0b\xc7\x13!\xf7\xdb/\xda\x1a\x9df\x1cQ\xc0\x99H\x08\xa0c\x8f9/4\xc4\x05\xc6\x9eM\x8e\xe5V\xf8D\xc3\xfd\xad4\x94A\xb9[\x80\xb9\xcf\xe6\xd9\xb3M2\xd9N\xfbA\x18\x84/W\x9b\x92\xfe\xbb\xd6C\x85\xa3\xc6\xd2T\xd0\xb2\xb9\xf7R\xb4(s\xda\xbcX,9w\x17\x1c\xfb|\xa0\x87\xba\xca6>y\xba\\L4wc\x94\xe7$Y\x89\x07\x9b\xfe\x9b?{\x85'
packing_format = "<HHH" # little-endian, 3 * (2-byte unsigned short)
with open("license.sfb", "wb") as licensefh:
licensefh.write(struct.pack(packing_format, len(cenc), len(denc), len(fenc)))
licensefh.write(cenc)
licensefh.write(denc)
licensefh.write(fenc)
# close is automatic with a context-manager
with open("license.sfb", "rb") as licensefh2:
header_length = struct.calcsize(packing_format)
cenc2_len, denc2_len, fenc2_len = struct.unpack(packing_format, licensefh2.read(header_length))
cenc2 = licensefh2.read(cenc2_len)
denc2 = licensefh2.read(denc2_len)
fenc2 = licensefh2.read(fenc2_len)
assert len(cenc2) == cenc2_len and len(denc2) == denc2_len and len(fenc2) == fenc2_len # the file was not truncated
unread_bytes = licensefh2.read() # until EOF
assert len(unread_bytes) == 0 # there is nothing else in the file, everything has been read
assert cenc == cenc2
assert denc == denc2
assert fenc == fenc2
How to decode multiple CDR records from a file with asn1tools?
this is my python code:
### input file name
fileName='D:/Python/Asn1/SIO/bHWMSC12021043012454329.dat'
## create Dict from asn file struct
Foo = asn1tools.compile_files('asn_Huawei.asn',cache_dir='My-Cache',numeric_enums=True)
## Open binary file
with open(fileName,"rb+") as binaryfile:
buffer = binaryfile.read()
## Match and decode all record with Dict
decoded = Foo.decode('CallEventRecord',buffer)
print(decoded)
print(decoded) gives only the first record, but my file contains 1550 records. How do I read my file tag by tag with asn1tools?
I've got the same issue and am trying to figure it out. You can +1 this issue.
I managed to decode multi-CDR files with a combination of pyasn1 and asn1tools; I'll update when it is fully tested.
I would use "decode_with_length" instead of decode.
decoded, lenDecoded = Foo.decode_with_length('CallEventRecord',buffer)
buffer = buffer[lenDecoded:]
Then just loop until the buffer is empty.
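A minimal sketch of that loop, assuming Foo is the compiled specification and buffer holds the whole file as read above:
records = []
while buffer:
    # decode_with_length returns the record plus the number of bytes it consumed
    decoded, lenDecoded = Foo.decode_with_length('CallEventRecord', buffer)
    records.append(decoded)
    buffer = buffer[lenDecoded:]  # drop the bytes that were just decoded
print(len(records))  # one entry per CDR in the file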
For me (it depends heavily on how your files are internally structured and what your goal is), the fastest method so far is to use find() to jump to the next relevant ticket; mine always start with b'\xa7':
index = 0
with open(file, 'rb') as encoded_bytes:
    data = encoded_bytes.read()
file_len = len(data)
while index < file_len:
    next_occurrence = data[index:].find(b'\xa7')
    if next_occurrence < 0:
        break
    # ....
    index += next_occurrence
    decoded_record, record_length = schema.decode_with_length('CallEventRecord', data[index:])
    # .....
    index += 1
    continue
I'm trying to extract the date/time when a picture was taken from the CR2 (Canon format for raw pictures).
I know the CR2 specification, and I know I can use Python struct module to extract pieces from a binary buffer.
Briefly, the specification says that in Tag 0x0132 / 306 I can find a string of length 20 - the date and time.
I tried to get that tag by using:
struct.unpack_from(20*'s', buffer, 0x0132)
but I get
('\x00', '\x00', "'", '\x88', ... [and more crap])
Any ideas?
Edit
Many thanks for the thorough effort! The answers are phenomenal and I learned a lot about handling binary data.
Have you taken into account the header which should (according to the spec) precede the IFD block you're talking about?
I looked through the spec and it says the first IFD block follows the 16 byte header. So if we read bytes 16 and 17 (at offset 0x10 hex) we should get the number of entries in the first IFD block. Then we just have to search through each entry until we find a matching tag id which (as I read it) gives us the byte offset of your date / time string.
This works for me:
from struct import *

def FindDateTimeOffsetFromCR2( buffer, ifd_offset ):
    # Read the number of entries in IFD #0
    (num_of_entries,) = unpack_from('H', buffer, ifd_offset)
    print "ifd #0 contains %d entries"%num_of_entries
    # Work out where the date time is stored
    datetime_offset = -1
    for entry_num in range(0, num_of_entries):
        (tag_id, tag_type, num_of_value, value) = unpack_from('HHLL', buffer, ifd_offset+2+entry_num*12)
        if tag_id == 0x0132:
            print "found datetime at offset %d"%value
            datetime_offset = value
    return datetime_offset

if __name__ == '__main__':
    with open("IMG_6113.CR2", "rb") as f:
        buffer = f.read(1024) # reading the first 1kb of the file should be enough to find the date / time
        datetime_offset = FindDateTimeOffsetFromCR2(buffer, 0x10)
        print unpack_from(20*'s', buffer, datetime_offset)
Output for my example file is:
ifd #0 contains 14 entries
found datetime at offset 250
('2', '0', '1', '0', ':', '0', '8', ':', '0', '1', ' ', '2', '3', ':', '4', '5', ':', '4', '6', '\x00')
[edit] - a revised / more thorough example
from struct import *

recognised_tags = {
    0x0100 : 'imageWidth',
    0x0101 : 'imageLength',
    0x0102 : 'bitsPerSample',
    0x0103 : 'compression',
    0x010f : 'make',
    0x0110 : 'model',
    0x0111 : 'stripOffset',
    0x0112 : 'orientation',
    0x0117 : 'stripByteCounts',
    0x011a : 'xResolution',
    0x011b : 'yResolution',
    0x0128 : 'resolutionUnit',
    0x0132 : 'dateTime',
    0x8769 : 'EXIF',
    0x8825 : 'GPS data'};

def GetHeaderFromCR2( buffer ):
    # Unpack the header into a tuple
    header = unpack_from('HHLHBBL', buffer)
    print "\nbyte_order = 0x%04X"%header[0]
    print "tiff_magic_word = %d"%header[1]
    print "tiff_offset = 0x%08X"%header[2]
    print "cr2_magic_word = %d"%header[3]
    print "cr2_major_version = %d"%header[4]
    print "cr2_minor_version = %d"%header[5]
    print "raw_ifd_offset = 0x%08X\n"%header[6]
    return header

def FindDateTimeOffsetFromCR2( buffer, ifd_offset, endian_flag ):
    # Read the number of entries in IFD #0
    (num_of_entries,) = unpack_from(endian_flag+'H', buffer, ifd_offset)
    print "Image File Directory #0 contains %d entries\n"%num_of_entries
    # Work out where the date time is stored
    datetime_offset = -1
    # Go through all the entries looking for the datetime field
    print " id | type | number | value "
    for entry_num in range(0, num_of_entries):
        # Grab this IFD entry
        (tag_id, tag_type, num_of_value, value) = unpack_from(endian_flag+'HHLL', buffer, ifd_offset+2+entry_num*12)
        # Print out the entry for information
        print "%04X | %04X | %08X | %08X "%(tag_id, tag_type, num_of_value, value),
        if tag_id in recognised_tags:
            print recognised_tags[tag_id]
        # If this is the datetime one we're looking for, make a note of the offset
        if tag_id == 0x0132:
            assert tag_type == 2
            assert num_of_value == 20
            datetime_offset = value
    return datetime_offset
if __name__ == '__main__':
    with open("IMG_6113.CR2", "rb") as f:
        # reading the first 1kb of the file should be enough to find the date/time
        buffer = f.read(1024)
        # Grab the various parts of the header
        (byte_order, tiff_magic_word, tiff_offset, cr2_magic_word, cr2_major_version, cr2_minor_version, raw_ifd_offset) = GetHeaderFromCR2(buffer)
        # Set the endian flag (default to native byte order)
        endian_flag = '@'
        if byte_order == 0x4D4D:
            # motorola format
            endian_flag = '>'
        elif byte_order == 0x4949:
            # intel format
            endian_flag = '<'
        # Search for the datetime entry offset
        datetime_offset = FindDateTimeOffsetFromCR2(buffer, 0x10, endian_flag)
        datetime_string = unpack_from(20*'s', buffer, datetime_offset)
        print "\nDatetime: "+"".join(datetime_string)+"\n"
0x0132 is not the offset, it's the tag number of the date. CR2, or TIFF respectively, is a directory-based format. You have to look up the entry for the (known) tag you are looking for.
Edit:
Ok, first of all, you have to read whether the file data is saved in little- or big-endian format. The first eight bytes form the header, and the first two bytes of that header specify the endianness. Python's struct module allows you to handle little- and big-endian data by prefixing a format string with either '<' or '>'. So, assuming data is a buffer containing your CR2 image, you can handle endianness via
header = data[:8]
endian_flag = "<" if header[:2] == "II" else ">"
The format specification states that the first image file directory begins at an offset relative to the beginning of the file, with the offset being specified in the last 4 bytes of the header. So, to get the offset to the first IFD, you can use a line similar to this one:
ifd_offset = struct.unpack("{0}I".format(endian_flag), header[4:])[0]
You can now go ahead and read the first IFD. You will find the number of entries in the directory at the specified offset into the file; that count is two bytes wide. Thus, you would read the number of entries in the first IFD using:
number_of_entries = struct.unpack("{0}H".format(endian_flag), data[ifd_offset:ifd_offset+2])[0]
A field entry is 12 bytes long, so you can calculate the length of the IFD. After number_of_entries * 12 bytes, there will be another 4 byte long offset, telling you where to look for the next directory. That is basically how you work with TIFF and CR2 images.
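As a sketch of that directory chaining (next_ifd_offset is a hypothetical helper; data, endian_flag and ifd_offset are as defined above):
import struct

def next_ifd_offset(data, endian_flag, ifd_offset):
    # number of 12-byte field entries in this IFD
    number_of_entries = struct.unpack("{0}H".format(endian_flag),
                                      data[ifd_offset:ifd_offset+2])[0]
    # the 4 bytes after the entry table hold the offset of the next IFD
    # (zero means this directory is the last one)
    link_pos = ifd_offset + 2 + number_of_entries * 12
    return struct.unpack("{0}I".format(endian_flag),
                         data[link_pos:link_pos+4])[0]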
The "magic" here is to note that with each of the 12 byte field entries, the first two bytes will be the tag ID. And that is where you look for your tag 0x0132. So, given you know that the first IFD starts at ifd_offset in the file, you can scan the first directory via:
current_position = ifd_offset + 2
for field_offset in xrange(current_position, current_position + number_of_entries*12, 12):
    field_tag = struct.unpack("{0}H".format(endian_flag), data[field_offset:field_offset+2])[0]
    field_type = struct.unpack("{0}H".format(endian_flag), data[field_offset+2:field_offset+4])[0]
    value_count = struct.unpack("{0}I".format(endian_flag), data[field_offset+4:field_offset+8])[0]
    value_offset = struct.unpack("{0}I".format(endian_flag), data[field_offset+8:field_offset+12])[0]
    if field_tag == 0x0132:
        # You are now reading a field entry containing the date and time
        assert field_type == 2    # Type 2 is ASCII
        assert value_count == 20  # You would expect a string length of 20 here
        date_time = struct.unpack("20s", data[value_offset:value_offset+20])
        print date_time
You'd obviously want to refactor that unpacking into a common function and probably wrap the whole format into a nice class, but that is beyond the scope of this example. You can also shorten the unpacking by combining multiple format strings into one, yielding a larger tuple containing all the fields, which you can unpack into distinct variables; I left that out for clarity.
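For instance, the four unpack calls above could collapse into a single one (a sketch, using the same names as above):
field_tag, field_type, value_count, value_offset = struct.unpack(
    "{0}HHII".format(endian_flag), data[field_offset:field_offset+12])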
I found that EXIF.py from https://github.com/ianare/exif-py reads the EXIF data from .CR2 files. It seems that because .CR2 files are based on .TIFF files, EXIF.py is compatible.
import EXIF
import time
# Change the filename to be suitable for you
f = open('../DCIM/100CANON/IMG_3432.CR2', 'rb')
data = EXIF.process_file(f)
f.close()
date_str = data['EXIF DateTimeOriginal'].values
# We have the raw data
print date_str
# We can now convert it
date = time.strptime(date_str, '%Y:%m:%d %H:%M:%S')
print date
And this prints:
2011:04:30 11:08:44
(2011, 4, 30, 11, 8, 44, 5, 120, -1)
I am trying to decode data received over a TCP connection. The packets are small, no more than 100 bytes. However, when there are a lot of them I receive some of the packets joined together. Is there a way to prevent this? I am using Python.
I have tried to separate the packets; my source is below. The packets start with an STX byte and end with an ETX byte; the byte following the STX is the packet length (packet lengths less than 5 are invalid), and the checksum occupies the last bytes before the ETX.
def decode(data):
    while True:
        start = data.find(STX)
        if start == -1: # no STX in message
            pkt = ''
            data = ''
            break
        # STX found, next byte is the length
        pktlen = ord(data[1])
        # check message ends in ETX (pktlen-1) or checksum invalid
        if pktlen < 5 or data[pktlen-1] != ETX or checksum_valid(data[start:pktlen]) == False:
            print "Invalid Pkt"
            data = data[start+1:]
            continue
        else:
            pkt = data[start:pktlen]
            data = data[pktlen:]
            break
    return data, pkt
I use it like this
# process reports
try:
    data = sock.recv(256)
except:
    continue
else:
    while data:
        data, pkt = decode(data)
        if pkt:
            process(pkt)
Also, if there are multiple packets in the data stream, is it best to return the packets as a collection of lists, or just return the first packet? I am not that familiar with Python, only C; is this method OK? Any advice would be most appreciated. Thanks in advance.
I would create a class that is responsible for decoding the packets from a stream, like this:
class PacketDecoder(object):
    STX = ...
    ETX = ...

    def __init__(self):
        self._stream = ''

    def feed(self, buffer):
        self._stream += buffer

    def decode(self):
        '''
        Yields packets from the current stream.
        '''
        while len(self._stream) > 2:
            end = self._stream.find(self.ETX)
            if end == -1:
                break
            packet_len = ord(self._stream[1])
            packet = self._stream[:end]
            if packet_len >= 5 and check_sum_valid(packet):
                yield packet
            self._stream = self._stream[end+1:]
And then use like this:
decoder = PacketDecoder()
while True:
    data = sock.recv(256)
    if not data:
        # handle lost connection...
        break
    decoder.feed(data)
    for packet in decoder.decode():
        process(packet)
TCP provides a data stream, not individual packets, at the interface level. If you want discrete packets, you can use UDP (and handle lost or out of order packets on your own), or put some data separator inline. It sounds like you are doing that already, with STX/ETX as your separators. However, as you note, you get multiple messages in one data chunk from your TCP stack.
Note that unless you are doing some other processing, data in the code you show does not necessarily contain an integral number of messages. That is, it is likely that the last STX will not have a matching ETX. The ETX will be in the next data chunk without an STX.
You should probably read individual messages from the TCP data stream and return them as they occur.
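A minimal sketch of that approach, assuming the poster's STX/ETX framing and checksum_valid(); the buffer carries partial messages over to the next recv():
buf = ''
while True:
    chunk = sock.recv(256)
    if not chunk:
        break  # connection closed
    buf += chunk
    while True:
        start = buf.find(STX)
        if start == -1 or len(buf) < start + 2:
            break  # no frame start yet, or the length byte has not arrived
        pktlen = ord(buf[start+1])
        if pktlen < 5:
            buf = buf[start+1:]  # invalid length, resync past this STX
            continue
        if len(buf) < start + pktlen:
            break  # message incomplete, wait for more data
        pkt = buf[start:start+pktlen]
        buf = buf[start+pktlen:]
        if pkt[-1] == ETX and checksum_valid(pkt):
            process(pkt)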
Try scapy, a powerful interactive packet manipulation program.
Where does the data come from? Instead of trying to decode it by hand, why not use the excellent Impacket package:
http://oss.coresecurity.com/projects/impacket.html
Nice and simple... :)
The trick is in the file object.
f = sock.makefile()
while True:
    STX = f.read(1)
    pktlen = f.read(1)
    wholePacket = STX + pktlen + f.read(ord(pktlen)-2)
    doSomethingWithPacket(wholePacket)
And that's it! (There is also no need to check checksums when using TCP.)
And here is a more "robust"(?) version (it uses STX and checksum):
f = sock.makefile()
while True:
    while f.read(1) != STX:
        continue
    pktlen = f.read(1)
    wholePacket = STX + pktlen + f.read(ord(pktlen)-2)
    if checksum_valid(wholePacket):
        doSomethingWithPacket(wholePacket)
I have to convert a number of large files (up to 2GB each) from EBCDIC 500 to Latin-1. Since I could only find EBCDIC to ASCII converters (dd, recode) and the files contain some additional proprietary character codes, I thought I'd write my own converter.
I have the character mapping so I'm interested in the technical aspects.
This is my approach so far:
# char mapping lookup table
EBCDIC_TO_LATIN1 = {
    0xC1:'41', # A
    0xC2:'42', # B
    # and so on...
}
BUFFER_SIZE = 1024 * 64
ebd_file = file(sys.argv[1], 'rb')
latin1_file = file(sys.argv[2], 'wb')
buffer = ebd_file.read(BUFFER_SIZE)
while buffer:
    latin1_file.write(ebd2latin1(buffer))
    buffer = ebd_file.read(BUFFER_SIZE)

ebd_file.close()
latin1_file.close()
This is the function that does the converting:
def ebd2latin1(ebcdic):
    result = []
    for ch in ebcdic:
        result.append(EBCDIC_TO_LATIN1[ord(ch)])
    return ''.join(result).decode('hex')
The question is whether or not this is a sensible approach from an engineering standpoint. Does it have some serious design issues? Is the buffer size OK? And so on...
As for the "proprietary characters" that some don't believe in: Each file contains a year's worth of patent documents in SGML format. The patent office has been using EBCDIC until they switched to Unicode in 2005. So there are thousands of documents within each file. They are separated by some hex values that are not part of any IBM specification. They were added by the patent office. Also, at the beginning of each file there are a few digits in ASCII that tell you about the length of the file. I don't really need that information but if I want to process the file so I have to deal with them.
Also:
$ recode IBM500/CR-LF..Latin1 file.ebc
recode: file.ebc failed: Ambiguous output in step `CR-LF..data'
Thanks for the help so far.
EBCDIC 500, aka Code Page 500, is among Python's encodings, although you link to cp1047, which isn't. Which one are you using, really? Anyway, this works for cp500 (or any other encoding that you have).
from __future__ import with_statement
import sys
from contextlib import nested
BUFFER_SIZE = 16384
with nested(open(sys.argv[1], 'rb'), open(sys.argv[2], 'wb')) as (infile, outfile):
    while True:
        buffer = infile.read(BUFFER_SIZE)
        if not buffer:
            break
        outfile.write(buffer.decode('cp500').encode('latin1'))
This way you shouldn't need to keep track of the mappings yourself.
If you set up the table correctly, then you just need to do:
translated_chars = ebcdic.translate(EBCDIC_TO_LATIN1)
where ebcdic contains EBCDIC characters and EBCDIC_TO_LATIN1 is a 256-char string which maps each EBCDIC character to its Latin-1 equivalent. The characters in EBCDIC_TO_LATIN1 are the actual binary values rather than their hex representations. For example, if you are using code page 500, the first 16 bytes of EBCDIC_TO_LATIN1 would be
'\x00\x01\x02\x03\x37\x2D\x2E\x2F\x16\x05\x25\x0B\x0C\x0D\x0E\x0F'
using this reference.
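A minimal sketch of building that 256-byte table with Python's own cp500 codec instead of typing it in by hand (assuming cp500 really is the right code page):
import sys

# map each EBCDIC byte through the cp500 codec to its Latin-1 equivalent
EBCDIC_TO_LATIN1 = ''.join(
    chr(i).decode('cp500').encode('latin1', 'replace') for i in range(256))

with open(sys.argv[1], 'rb') as f:
    ebcdic = f.read()
translated_chars = ebcdic.translate(EBCDIC_TO_LATIN1)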
While this might not help the original poster anymore, some time ago I released a package for Python 2.6+ and 3.2+ that adds most of the western 8 bit mainframe codecs including CP1047 (French) and CP1141 (German): https://pypi.python.org/pypi/ebcdic. Simply import ebcdic to add the codecs and then use open(..., encoding='cp1047') to read or write files.
Answer 1:
Yet another silly question: What gave you the impression that recode produced only ASCII as output? AFAICT it will transcode ANY of its repertoire of charsets to ANY of its repertoire, AND its repertoire includes IBM cp500 and cp1047, and OF COURSE latin1. Reading the comments, you will note that Lennaert and I have discovered that there aren't any "proprietary" codes in those two IBM character sets. So you may well be able to use recode after all, once you are certain what charset you've actually got.
Answer 2:
If you really need/want to transcode IBM cp1047 via Python, you might like to first get the mapping from an authoritative source, processing it via a script with some checks:
URL = "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm"
"""
Sample lines:
<U0000> \x00 |0
<U0001> \x01 |0
<U0002> \x02 |0
<U0003> \x03 |0
<U0004> \x37 |0
<U0005> \x2D |0
"""
import urllib, re
text = urllib.urlopen(URL).read()
regex = r"<U([0-9a-fA-F]{4,4})>\s+\\x([0-9a-fA-F]{2,2})\s"
results = re.findall(regex, text)
wlist = [None] * 256
for result in results:
    unum, inum = [int(x, 16) for x in result]
    assert wlist[inum] is None
    assert 0 <= unum <= 255
    wlist[inum] = chr(unum)
assert not any(x is None for x in wlist)
print repr(''.join(wlist))
Then carefully copy/paste the output into your transcoding script for use with Vinay's buffer.translate(the_mapping) idea, with a buffer size perhaps a bit larger than 16KB and certainly a bit smaller than 2GB :-)
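Continuing straight on from that script, a sketch of the translate-based converter itself (the 64KB buffer size is an assumption within the range suggested above):
import sys

THE_MAPPING = ''.join(wlist)  # the 256-byte table built above
BUFFER_SIZE = 1024 * 64

infile = open(sys.argv[1], 'rb')
outfile = open(sys.argv[2], 'wb')
while True:
    buffer = infile.read(BUFFER_SIZE)
    if not buffer:
        break
    outfile.write(buffer.translate(THE_MAPPING))
infile.close()
outfile.close()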
No crystal ball, no info from OP, so had a bit of a rummage in the EPO website. Found freely downloadable weekly patent info files, still available in cp500/SGML even though website says this to be replaced by utf8/XML in 2006 :-). Got the 2009 week 27 file. Is a zip containing 2 files s350927[ab].bin. "bin" means "not XML". Got the spec! Looks possible that "proprietary codes" are actually BINARY fields. Each record has a fixed 252-byte header. First 5 bytes are record length in EBCDIC e.g. hex F0F2F2F0F8 -> 2208 bytes. Last 2 bytes of the fixed header are the BINARY length (redundant) of the following variable part. In the middle are several text fields, two 2-byte binary fields, and one 4-byte binary field. The binary fields are serial numbers within groups, but all I saw are 1. The variable part is SGML.
Example (last record from s350927b.bin):
Record number: 7266
pprint of header text and binary slices:
['EPB102055619 TXT00000001',
1,
' 20090701200927 08013627.8 EP20090528NN ',
1,
1,
' T *lots of spaces snipped*']
Edited version of the rather long SGML:
<PATDOC FILE="08013627.8" CY=EP DNUM=2055619 KIND=B1 DATE=20090701 STATUS=N>
*snip*
<B541>DE<B542>Windschutzeinheit für ein Motorrad
<B541>EN<B542>Windshield unit for saddle-ride type vehicle
<B541>FR<B542>Unité pare-brise pour motocyclette</B540>
*snip*
</PATDOC>
There are no header or trailer records, just this one record format.
So: if the OP's annual files are anything like this, we might be able to help him out.
Update: Above was the "2 a.m. in my timezone" version. Here's a bit more info:
OP said: "at the beginning of each file there are a few digits in ASCII that tell you about the length of the file." ... translate that to "at the beginning of each record there are five digits in EBCDIC that tell you exactly the length of the record" and we have a (very fuzzy) match!
Here is the URL of the documentation page: http://docs.epoline.org/ebd/info.htm
The FIRST file mentioned is the spec.
Here is the URL of the download-weekly-data page: http://ebd2.epoline.org/jsp/ebdst35.jsp
An observation: The data that I looked at is in the ST.35 series. There is also available for download ST.32 which appears to be a parallel version containing only the SGML content (in "reduced cp437/850", one tag per line). This indicates that the fields in the fixed-length header of the ST.35 records may not be very interesting, and can thus be skipped over, which would greatly simplify the transcoding task.
For what it's worth, here is my (investigatory, written after midnight) code:
[Update 2: tidied up the code a little; no functionality changes]
from pprint import pprint as pp
import sys
from struct import unpack

HDRSZ = 252
T = '>s' # text
H = '>H' # binary 2 bytes
I = '>I' # binary 4 bytes
hdr_defn = [
    6, T,
    38, H,
    40, T,
    94, I,
    98, H,
    100, T,
    251, H, # length of following SGML text
    HDRSZ + 1,
    ]
# above positions as per spec, reduce to allow for counting from 1
for i in xrange(0, len(hdr_defn), 2):
    hdr_defn[i] -= 1

def records(fname, output_encoding='latin1', debug=False):
    xlator = ''.join(chr(i).decode('cp500').encode(output_encoding, 'replace') for i in range(256))
    # print repr(xlator)
    def xlate(ebcdic):
        return ebcdic.translate(xlator)
        # return ebcdic.decode('cp500') # use this if unicode output desired
    f = open(fname, 'rb')
    recnum = -1
    while True:
        # get header
        buff = f.read(HDRSZ)
        if not buff:
            return # EOF
        recnum += 1
        if debug: print "\nrecnum", recnum
        assert len(buff) == HDRSZ
        recsz = int(xlate(buff[:5]))
        if debug: print "recsz", recsz
        # split remainder of header into text and binary pieces
        fields = []
        for i in xrange(0, len(hdr_defn) - 2, 2):
            ty = hdr_defn[i + 1]
            piece = buff[hdr_defn[i]:hdr_defn[i+2]]
            if ty == T:
                fields.append(xlate(piece))
            else:
                fields.append(unpack(ty, piece)[0])
        if debug: pp(fields)
        sgmlsz = fields.pop()
        if debug: print "sgmlsz: %d; expected: %d - %d = %d" % (sgmlsz, recsz, HDRSZ, recsz - HDRSZ)
        assert sgmlsz == recsz - HDRSZ
        # get sgml part
        sgml = f.read(sgmlsz)
        assert len(sgml) == sgmlsz
        sgml = xlate(sgml)
        if debug: print "sgml", sgml
        yield recnum, fields, sgml

if __name__ == "__main__":
    maxrecs = int(sys.argv[1]) # dumping out the last `maxrecs` records in the file
    fname = sys.argv[2]
    keep = [None] * maxrecs
    for recnum, fields, sgml in records(fname):
        # do something useful here
        keep[recnum % maxrecs] = (recnum, fields, sgml)
    keep.sort()
    for k in keep:
        if k:
            recnum, fields, sgml = k
            print
            print recnum
            pp(fields)
            print sgml
Assuming cp500 contains all of your "additional proprietary characters", a more concise version based on Lennart's answer, using the codecs module:
import sys, codecs
BUFFER_SIZE = 64*1024
ebd_file = codecs.open(sys.argv[1], 'r', 'cp500')
latin1_file = codecs.open(sys.argv[2], 'w', 'latin1')
buffer = ebd_file.read(BUFFER_SIZE)
while buffer:
    latin1_file.write(buffer)
    buffer = ebd_file.read(BUFFER_SIZE)

ebd_file.close()
latin1_file.close()