Bytes with escape for re module - python

import re
a = 0xf27f28d7
b = a.to_bytes(4, 'little')
with open(fimage, 'rb') as f:
print([hex(m.start(0)] for m in re.finditer(b, f.read()))
# b = b'\xd7(\x7f\xf2'
above code return error:
missing ), unterminated subpattern at position 1
I know the bytes includes a (. It should be \(. How to convert to byte to ensure no error with escape?
I am using Python 3.6. Thanks in advance.

You can convert your bytes somehow like that
b = b'\xd7(\x7f\xf2'
b = ''.join(['\\x'+hex(c)[2:] for c in b]).encode()

You can use re.escape(), but it will also escape bytes that don't correspond to printable characters, so you'll need to filter them out.
import re
b = b'\xd7(\x7f\xf2'
k = []
for i in b:
i_bytes = bytes([i])
if i < 0x80 and chr(i).isprintable():
i_bytes = re.escape(i_bytes)
k.append(i_bytes)
print(b''.join(k)) # -> b'\xd7\\(\x7f\xf2'
I'm sure this code could be improved, just not sure how.

Related

String from file to string utf-8 in python

So I am reading and manipulate a file with :
base_file = open(path+'/'+base_name, "r")
lines = base_file.readlines()
After this I search and find the "raw_data" start of line.
if re.match("\s{0,100}raw_data: ",line):
split_line = line.split("raw_data:")
print(split_line)
raw_string = split_line[1]
One example of raw_data is:
raw_data: "&\276!\300\307 =\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\}\277\210\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
And raw_string will be
print(raw_data)
"&\276!\300\307
=\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\}\277\210\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
If I tried to read this file I will obtain one char to one char even for escape characters.
So, my question is how to transform this plain text to utf-8 string so that I can have one character when reading \300 and not 4 characters.
I tried to pass "encondig =utf-8" in open file method but does not work.
I have made the same example passing raw_data as variable and it works properly.
RAW_DATA = "&\276!\300\307 =\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\\}\277\210\\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300<I>>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
print(f"Qnt -> {len(RAW_DATA)}") # Qnt -> 256
print(type(RAW_DATA))
at = 0
total = 0
while at < len(RAW_DATA):
fin = at+4
substrin = RAW_DATA[at:fin]
resu = FourString_float(substrin)
at = fin
For this example \300 is only one char.
Hope someone can help me.
The problem is that on the read file the escape \ symbols are coming in as \, but in the example you've provided they are being evaluated as part of the numerics that follow it. ie, \276 is read as a single character.
If you run:
RAW_DATA = r"&\276!\300\307 =\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\\}\277\210\\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300<I>>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
print(f"Qnt -> {len(RAW_DATA)}") # Qnt -> 256
print(type(RAW_DATA))
at = 0
total = 0
while at < len(RAW_DATA):
fin = at+4
substrin = RAW_DATA[at:fin]
resu = FourString_float(substrin)
at = fin
You would should be getting the same error that you were getting originally. Notice that we are using the raw-string literal instead of regular string literal. This will ensure that the \ don't get escaped.
You would need to evaluate the RAW_DATA to force it to evaluate the \.
You can do something like RAW_DATA = eval(f'"{RAW_DATA}"') or
import ast
RAW_DATA = ast.literal_eval(f'"{RAW_DATA}"')
Note, the second option is a bit more secure that doing a straight eval as you are limiting the scope of what can be executed.

How to decode longest sub-bytes into str?

Suppose I read a long bytes object from somewhere, knowing it is utf-8 encoded. But the read may not fully consume the available content so that the last character in the stream may be incomplete. Calling bytes.decode() on this object may result in a decode error. But what really fails is only the last few bytes. Is there a function that works in this case, returning the longest decoded string and the remaining bytes?
utf-8 encodes a character into at most 4 bytes, so trying to decode truncated bytes should work, but a vast majority of computation will be wasted, and I don't really like this solution.
To give a simple but concrete example:
>>> b0 = b'\xc3\x84\xc3\x96\xc3'
>>> b1 = b'\x9c\xc3\x84\xc3\x96\xc3\x9c'
>>> (b0 + b1).decode()
>>> 'ÄÖÜÄÖÜ'
(b0 + b1).decode() is fine, but b0.decode() will raise. The solution should be able to decode b0 for as much as possible and return the bytes that cannot be decoded.
You are describing the basic usage of io.TextIOWrapper: a buffered text stream over a binary stream.
>>> import io
>>> txt = 'before\N{PILE OF POO}after'
>>> b = io.BytesIO(txt.encode('utf-8'))
>>> t = io.TextIOWrapper(b)
>>> t.read(5)
'befor'
>>> t.read(1)
'e'
>>> t.read(1)
'💩'
>>> t.read(1)
'a'
Contrast with reading a bytes stream directly, where it would be possible to read halfway through an encoded pile of poo:
>>> b.seek(0)
0
>>> b.read(5)
b'befor'
>>> b.read(1)
b'e'
>>> b.read(1)
b'\xf0'
>>> b.read(1)
b'\x9f'
>>> b.read(1)
b'\x92'
>>> b.read(1)
b'\xa9'
>>> b.read(1)
b'a'
Specify encoding="utf-8" if you want to be explicit. The default encoding, i.e. locale.getpreferredencoding(False), would usually be utf-8 anyway.
As I mentioned in the comments under #wim's answer, I think you could use the codecs.iterdecode() incremental decoder to do this. Since it's a generator function, there's no need to manually save and restore its state between iterative calls to it.
Here's how how it might be used to handle a situation like the one you described:
import codecs
from random import randint
def reader(sequence):
""" Yield random length chunks of sequence until exhausted. """
plural = lambda word, n, ending='s': (word+ending) if n > 1 else word
i = 0
while i < len(sequence):
size = randint(1, 4)
chunk = sequence[i: i+size]
hexrepr = '0x' + ''.join('%02X' % b for b in chunk)
print('read {} {}: {}'.format(size, plural('byte', len(chunk)), hexrepr))
yield chunk
i += size
bytes_obj = b'\xc3\x84\xc3\x96\xc3\x9c\xc3\x84\xc3\x96\xc3\x9c' # 'ÄÖÜÄÖÜ'
for decoded in codecs.iterdecode(reader(bytes_obj), 'utf-8'):
print(decoded)
Sample output:
read 3 bytes: 0xC384C3
Ä
read 1 byte: 0x96
Ö
read 1 byte: 0xC3
read 3 bytes: 0x9CC384
ÜÄ
read 2 bytes: 0xC396
Ö
read 4 bytes: 0xC39C
Ü

How to change the bytes in a file?

I'm making a encryption program and I need to open file in binary mode to access non-ascii and non-printable characters, I need to check if character from a file is letter, number, symbol or unprintable character. That means I have to check 1 by 1 if bytes (when they are decoded to ascii) match any of these characters:
{^9,dzEV=Q4ciT+/s};fnq3BFh% #2!k7>YSU<GyD\I]|OC_e.W0M~ua-jR5lv1wA`#8t*xr'K"[P)&b:g$p(mX6Ho?JNZL
I think I could encode these characters above to binary and then compare them with bytes. I don't know how to do this.
P.S. Sorry for bad English and binary misunderstanding. (I hope you
know what I mean by bytes, I mean characters in binary mode like
this):
\x01\x00\x9a\x9c\x18\x00
There are two major string types in Python: bytestrings (a sequence of bytes) that represent binary data and Unicode strings (a sequence of Unicode codepoints) that represent human-readable text. It is simple to convert one into another (☯):
unicode_text = bytestring.decode(character_encoding)
bytestring = unicode_text.encode(character_encoding)
If you open a file in binary mode e.g., 'rb' then file.read() returns a bytestring (bytes type):
>>> b'A' == b'\x41' == chr(0b1000001).encode()
True
There are several methods that can be used to classify bytes:
string methods such as bytes.isdigit():
>>> b'1'.isdigit()
True
string constants such as string.printable
>>> import string
>>> b'!' in string.printable.encode()
True
regular expressions such as \d
>>> import re
>>> bool(re.match(br'\d+$', b'123'))
True
classification functions in curses.ascii module e.g., curses.ascii.isprint()
>>> from curses import ascii
>>> bytearray(filter(ascii.isprint, b'123'))
bytearray(b'123')
bytearray is a mutable sequence of bytes — unlike a bytestring you can change it inplace e.g., to lowercase every 3rd byte that is uppercase:
>>> import string
>>> a = bytearray(b'ABCDEF_')
>>> uppercase = string.ascii_uppercase.encode()
>>> a[::3] = [b | 0b0100000 if b in uppercase else b
... for b in a[::3]]
>>> a
bytearray(b'aBCdEF_')
Notice: b'ad' are lowercase but b'_' remained the same.
To modify a binary file inplace, you could use mmap module e.g., to lowercase 4th column in every other line in 'file':
#!/usr/bin/env python3
import mmap
import string
uppercase = string.ascii_uppercase.encode()
ncolumn = 3 # select 4th column
with open('file', 'r+b') as file, \
mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_WRITE) as mm:
while True:
mm.readline() # ignore every other line
pos = mm.tell() # remember current position
if not mm.readline(): # EOF
break
if mm[pos + ncolumn] in uppercase:
mm[pos + ncolumn] |= 0b0100000 # lowercase
Note: Python 2 and 3 APIs differ in this case. The code uses Python 3.
Input
ABCDE1
FGHIJ
ABCDE
FGHI
Output
ABCDE1
FGHiJ
ABCDE
FGHi
Notice: 4th column became lowercase on 2nd and 4h lines.
Typically if you want to change a file: you read from the file, write modifications to a temporary file, and on success you move the temporary file inplace of the original file:
#!/usr/bin/env python3
import os
import string
from tempfile import NamedTemporaryFile
caesar_shift = 3
filename = 'file'
def caesar_bytes(plaintext, shift, alphabet=string.ascii_lowercase.encode()):
shifted_alphabet = alphabet[shift:] + alphabet[:shift]
return plaintext.translate(plaintext.maketrans(alphabet, shifted_alphabet))
dest_dir = os.path.dirname(filename)
chunksize = 1 << 15
with open(filename, 'rb') as file, \
NamedTemporaryFile('wb', dir=dest_dir, delete=False) as tmp_file:
while True: # encrypt
chunk = file.read(chunksize)
if not chunk: # EOF
break
tmp_file.write(caesar_bytes(chunk, caesar_shift))
os.replace(tmp_file.name, filename)
Input
abc
def
ABC
DEF
Output
def
ghi
ABC
DEF
To convert the output back, set caesar_shift = -3.
To open a file in binary mode you use the open("filena.me", "rb") command. I've never used the command personally, but that should get you the information you need.

How to print out 0xfb in python

I'm falling the unicode hell.
My environment in on unix, python 2.7.3
LC_CTYPE=zh_TW.UTF-8
LANG=en_US.UTF-8
I'm trying to dump hex encoded data in human readable format, here is simplified code
#! /usr/bin/env python
# encoding:utf-8
import sys
s=u"readable\n" # previous result keep in unicode string
s2="fb is not \xfb" # data read from binary file
s += s2
print s # method 1
print s.encode('utf-8') # method 2
print s.encode('utf-8','ignore') # method 3
print s.decode('iso8859-1') # method 4
# method 1-4 display following error message
#UnicodeDecodeError: 'ascii' codec can't decode byte 0xfb
# in position 0: ordinal not in range(128)
f = open('out.txt','wb')
f.write(s)
I just want to print out the 0xfb.
I should describe more here. The key is 's += s2'.
Where s will keep my previous decoded string.
And the s2 is next string which should append into s.
If I modified as following, it occurs on write file.
s=u"readable\n"
s2="fb is not \xfb"
s += s2.decode('cp437')
print s
f=open('out.txt','wb')
f.write(s)
# UnicodeEncodeError: 'ascii' codec can't encode character
# u'\u221a' in position 1: ordinal not in range(128)
I wish the result of out.txt is
readable
fb is not \xfb
or
readable
fb is not 0xfb
[Solution]
#! /usr/bin/env python
# encoding:utf-8
import sys
import binascii
def fmtstr(s):
r = ''
for c in s:
if ord(c) > 128:
r = ''.join([r, "\\x"+binascii.hexlify(c)])
else:
r = ''.join([r, c])
return r
s=u"readable"
s2="fb is not \xfb"
s += fmtstr(s2)
print s
f=open('out.txt','wb')
f.write(s)
I strongly suspect that your code is actually erroring out on the previous line: the s += s2 one. s2 is just a series of bytes, which can't be arbitrarily tacked on to a unicode object (which is instead a series of code points).
If you had intended the '\xfb' to represent U+FB, LATIN SMALL LETTER U WITH CIRCUMFLEX, it would have been better to assign it like this instead:
s2 = u"\u00fb"
But you said that you just want to print out \xHH codes for control characters. If you just want it to be something humans can understand which still makes it apparent that special characters are in a string, then repr may be enough. First, don't have s be a unicode object, because you're treating your strings here as a series of bytes, not a series of code points.
s = s.encode('utf-8')
s += s2
print repr(s)
Finally, if you don't want the extra quotes on the outside that repr adds, for nice pretty printing or whatever, there's not a simple builtin way to do that in Python (that I know of). I've used something like this before:
import re
controlchars_re = re.compile(r'[\x00-\x31\x7f-\xff]')
def _show_control_chars(match):
txt = repr(match.group(0))
return txt[1:-1]
def escape_special_characters(s):
return controlchars_re.sub(_show_control_chars, s.replace('\\', '\\\\'))
You can pretty easily tweak the controlchars_re regex to define which characters you care about escaping.

decode base64 like string with different index table(s)

My problem is, that I have something encoded (base64 like) with a differnet index table:
0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/
instead of
ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/
so when I use base64.b64decode() it gives me a wrong result.
Is there a way to set this table durring conversion (as a parameter maybe)?
Or should I "convert" the wrong base64 string, I mean replace 0 to A, 1 to B, etc... and than use base64decode? if so what is the best and fast workaround for this?
update1: I use this, which works, but looks a bit slow, and unprofessional. :)
def correctbase64(str):
dicta = [ ['0','A'], ['1','B'], ['2','C'], ['3','D'], ['4','E'], ['5','F'], ['6','G'], ['7','H'], ['8','I'], ['9','J'], ['A','K'], ['B','L'], ['C','M'], ['D','N'], ['E','O'], ['F','P'], ['G','Q'], ['H','R'], ['I','S'], ['J','T'], ['K','U'], ['L','V'], ['M','W'], ['N','X'], ['O','Y'], ['P','Z'], ['Q','a'], ['R','b'], ['S','c'], ['T','d'], ['U','e'], ['V','f'], ['W','g'], ['X','h'], ['Y','i'], ['Z','j'], ['a','k'], ['b','l'], ['c','m'], ['d','n'], ['e','o'], ['f','p'], ['g','q'], ['h','r'], ['i','s'], ['j','t'], ['k','u'], ['l','v'], ['m','w'], ['n','x'], ['o','y'], ['p','z'], ['q','0'], ['r','1'], ['s','2'], ['t','3'], ['u','4'], ['v','5'], ['w','6'], ['x','7'], ['y','8'], ['z','9'] ]
l = list(str)
for i in range(len(l)):
for c in dicta:
if l[i] == c[0]:
l[i] = c[1]
break
return "".join(l)
Something like this should work (WARNING: untested code; may be full of mistakes):
import string
my_base64chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/"
std_base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
s = s.translate(string.maketrans(my_base64chars, std_base64chars))
data = base64.b64decode(s)
It isn't possible to make the standard base64 functions (or the lower-level ones in binascii that they call) use a custom table.
You can use translate() and maketrans():
from string import maketrans
base64fixTable = maketrans("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
def correctbase64(str):
return str.translate(base64fixTable)
print "Hello Reverse Engineering!\n"
import string
import base64
my_base64chars = "WXYZlabcd3fghijko12e456789ABCDEFGHIJKL+/MNOPQRSTUVmn0pqrstuvwxyz"
std_base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
s = 'whatever encoded message you have that used my_base64chars index'
c = s.translate(string.maketrans(my_base64chars, std_base64chars))
data = base64.b64decode(c)
print (data)
Use maketrans to build a translation table and then translate from the first alphabet to the second. Then base64 decode.
import string
import base64
def decode(str):
#make a translation table.
table = string.maketrans(
#your alphabet
string.digits + string.uppercase + string.lowercase + "+/",
#the original alphabet
string.uppercase + string.lowercase + string.digits + "+/"
)
#translate
str.translate(s, table)
#finally decode
return base64.b64decode(str)
this will handle error TypeError: Incorrect padding
from string import maketrans
import base64
STANDARD_ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
CUSTOM_ALPHABET = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/'
def correctbase64(input):
DECODE_TRANS = maketrans(CUSTOM_ALPHABET, STANDARD_ALPHABET)
newStr = input.translate(DECODE_TRANS)
# Add '=' char at the end of the string
newStr += '='
return base64.b64decode(newStr)
print custom_base64decode('x/Tcw/g') # hello

Categories