I need to perform custom escaping over a byte array in Python. However, during escaping Python converts bytes to integers, making performance optimization very difficult. How can I speed up my escaping function?
# Map of byte value -> two-byte escape sequence (backslash + letter).
ESCAPE_DICT = {
    0x00: [0x5C, 0x7A],  # null            -> \z
    0x22: [0x5C, 0x71],  # "               -> \q
    0x3B: [0x5C, 0x73],  # ;               -> \s
    0x5C: [0x5C, 0x5C],  # \               -> \\
    0x0A: [0x5C, 0x6E],  # line feed       -> \n
    0x0C: [0x5C, 0x66],  # form feed       -> \f
    0x0D: [0x5C, 0x63],  # carriage return -> \c
}

def escape(string: bytes) -> bytes:
    """Return *string* with every special byte replaced by its escape pair.

    Bytes not present in ESCAPE_DICT are copied through unchanged.
    """
    # BUG FIX: the original returned bytes(escaped_array), a NameError —
    # the accumulator was named escaped_list. A bytearray also avoids the
    # final list->bytes conversion cost.
    escaped = bytearray()
    # Iterating a bytes object yields ints directly; no index arithmetic needed.
    for byte in string:
        replacement = ESCAPE_DICT.get(byte)
        if replacement is None:
            # Not a special byte: pass through.
            escaped.append(byte)
        else:
            # Special byte: emit its two-byte escape sequence.
            escaped.extend(replacement)
    return bytes(escaped)
import re
# Byte -> raw-bytes escape sequence ("rb" literals read as raw bytes).
ESCAPE_DICT = {
    b'\x00': rb'\z',  # null
    b'"': rb'\q',
    b';': rb'\s',
    b'\\': rb'\\',
    b'\n': rb'\n',  # line feed
    b'\f': rb'\f',  # form feed
    b'\r': rb'\c',  # carriage return
}

# Regex character class covering every byte that needs escaping,
# e.g. '[\x00\x22\x3b\x5c\x0a\x0c\x0d]'.
ESCAPE_CLASS = '[{}]'.format(''.join('\\x' + key.hex() for key in ESCAPE_DICT))
ESCAPE_REGEX = re.compile(ESCAPE_CLASS.encode())

def escape(string: bytes) -> bytes:
    """Replace each escapable byte of *string* with its two-byte sequence."""
    return ESCAPE_REGEX.sub(lambda match: ESCAPE_DICT[match.group(0)], string)
x = b'"abc\ndef\rpqr\x00stu\\xyz"'
y = escape(x)
from pprint import pprint
# Show the generated character class, the compiled regex, and the
# before/after byte strings.
for value in (ESCAPE_CLASS, ESCAPE_REGEX, x, y):
    pprint(value)
# =>
# '[\\x00\\x22\\x3b\\x5c\\x0a\\x0c\\x0d]'
# re.compile(b'[\\x00\\x22\\x3b\\x5c\\x0a\\x0c\\x0d]')
# b'"abc\ndef\rpqr\x00stu\\xyz"'
# b'\\qabc\\ndef\\cpqr\\zstu\\\\xyz\\q'
You can read the rb prefix as “raw bytes”.
Your escapes are a bit strange, though. E.g., the carriage return is normally \r, not \c, and \s normally stands for generic whitespace.
Related
I am working on migrating my project in python to golang and I have a use case for converting utf-8 encoding to corresponding gsm ones if possible. I am very new to go, it will be really helpful to get some documentation or examples around it.
For example: Python snippet
ằ as unicode -> a after gsm encoding
# Transliterate each character toward the GSM charset, falling back to
# unidecode for non-ASCII characters.
# NOTE(review): this excerpt mixes str and bytes (+= character.encode('utf-8')),
# which only works on Python 2 where str is bytes — confirm target version.
for character in text:
    # Characters already in the GSM charset pass through unchanged.
    if is_gsm(character):
        transliterated_text += character.encode('utf-8')
        continue
    if is_nonascii_utf8(character):
        # Ask unidecode for an ASCII approximation (e.g. 'ằ' -> 'a').
        transliterated_char = unidecode.unidecode(character)
        if transliterated_char == '?' or transliterated_char == '':
            # No usable approximation: give up on a GSM transliteration.
            gsm = False
            break
        # NOTE(review): `rc` is not defined in this excerpt — presumably the
        # original character; verify against the full program.
        if transliterated_char != rc:
            character = transliterated_char
        transliterated_text += character
    else:
        transliterated_text += character.encode('utf-8')
# Only adopt the transliteration if nothing failed and the result is pure GSM.
if gsm and is_gsm(transliterated_text.decode('utf-8')):
    text = transliterated_text.decode('utf-8')
Thanks
You can do it in this way:
package main
import (
"fmt"
"regexp"
"strings"
)
// utf8GsmChars maps UTF-8 characters to their GSM 03.38 encodings.
// Single-byte values are characters of the GSM default alphabet; two-byte
// values starting with \x1B are escape sequences from the extension table.
var utf8GsmChars = map[string]string{
	`#`: "\x00", `£`: "\x01", `$`: "\x02",
	`¥`: "\x03", `è`: "\x04", `é`: "\x05",
	`ù`: "\x06", `ì`: "\x07", `ò`: "\x08",
	`Ç`: "\x09", `Ø`: "\x0B", `ø`: "\x0C",
	`Å`: "\x0E", `Δ`: "\x10", `_`: "\x11",
	`Φ`: "\x12", `Γ`: "\x13", `Λ`: "\x14",
	`Ω`: "\x15", `Π`: "\x16", `Ψ`: "\x17",
	`Σ`: "\x18", `Θ`: "\x19", `Ξ`: "\x1A",
	`Æ`: "\x1C", `æ`: "\x1D", `ß`: "\x1E",
	`É`: "\x1F", `Ä`: "\x5B", `Ö`: "\x5C",
	`Ñ`: "\x5D", `Ü`: "\x5E", `§`: "\x5F",
	`¿`: "\x60", `ä`: "\x7B", `ö`: "\x7C",
	`ñ`: "\x7D", `ü`: "\x7E", `à`: "\x7F",
	// Extension table (two-byte escape sequences).
	`^`: "\x1B\x14", `{`: "\x1B\x28",
	`}`: "\x1B\x29", `\`: "\x1B\x2F",
	`[`: "\x1B\x3C", `~`: "\x1B\x3D",
	`]`: "\x1B\x3E", `|`: "\x1B\x40",
	`€`: "\x1B\x65",
}
// gsmUtf8Chars maps single GSM 03.38 code points back to their UTF-8 byte
// sequences — the inverse of utf8GsmChars above.
// BUG FIX: three entries contradicted the forward map (roundtrip would not
// restore the original character): \x0E is Å (was ø), \x14 is Λ (was Π),
// and \x7B is ä (was è).
var gsmUtf8Chars = map[string]string{
	"\x00": "\x40",
	"\x01": "\xC2\xA3",
	"\x02": "\x24",
	"\x03": "\xC2\xA5",
	"\x04": "\xC3\xA8",
	"\x05": "\xC3\xA9",
	"\x06": "\xC3\xB9",
	"\x07": "\xC3\xAC",
	"\x08": "\xC3\xB2",
	"\x09": "\xC3\x87",
	"\x0B": "\xC3\x98",
	"\x0C": "\xC3\xB8",
	"\x0E": "\xC3\x85", // Å — was \xC3\xB8 (ø), breaking the Å roundtrip
	"\x0F": "\xC3\xA5",
	"\x10": "\xCE\x94",
	"\x11": "\x5F",
	"\x12": "\xCE\xA6",
	"\x13": "\xCE\x93",
	"\x14": "\xCE\x9B", // Λ — was \xCE\xA0 (Π), which belongs to \x16
	"\x15": "\xCE\xA9",
	"\x16": "\xCE\xA0",
	"\x17": "\xCE\xA8",
	"\x18": "\xCE\xA3",
	"\x19": "\xCE\x98",
	"\x1A": "\xCE\x9E",
	"\x1C": "\xC3\x86",
	"\x1D": "\xC3\xA6",
	"\x1E": "\xC3\x9F",
	"\x1F": "\xC3\x89",
	"\x20": "\x20",
	"\x24": "\xC2\xA4",
	"\x40": "\xC2\xA1",
	"\x5B": "\xC3\x84",
	"\x5C": "\xC3\x96",
	"\x5D": "\xC3\x91",
	"\x5E": "\xC3\x9C",
	"\x5F": "\xC2\xA7",
	"\x60": "\xC2\xBF",
	"\x7B": "\xC3\xA4", // ä — was \xC3\xA8 (è), which belongs to \x04
	"\x7C": "\xC3\xB6",
	"\x7D": "\xC3\xB1",
	"\x7E": "\xC3\xBC",
	"\x7F": "\xC3\xA0",
}
func UTF8ToGsm0338(text string) string {
var s string = text
for k, v := range utf8GsmChars {
s = strings.Replace(s, k, v, -1)
}
re := regexp.MustCompile("[\\x{0080}-\\x{10FFFF}]")
s = re.ReplaceAllString(s, "?")
return s
}
// GSM0338ToUTF8 converts a GSM 03.38 string back to UTF-8. Bytes without an
// entry in gsmUtf8Chars are copied through unchanged.
// BUG FIX: the original chained strings.Replace over a map in random order;
// since some replacement values are themselves keys (e.g. "\x11" -> "_",
// i.e. 0x5F, and "\x5F" is the § key), output already produced by one
// replacement could be re-replaced by a later one, nondeterministically.
// A single pass over the input bytes is deterministic and correct.
func GSM0338ToUTF8(text string) string {
	out := make([]byte, 0, len(text))
	for i := 0; i < len(text); i++ {
		if utf8Char, ok := gsmUtf8Chars[text[i:i+1]]; ok {
			out = append(out, utf8Char...)
		} else {
			out = append(out, text[i])
		}
	}
	return string(out)
}
// main demonstrates a UTF-8 -> GSM -> UTF-8 round trip.
func main() {
	original := "Hello World"
	encoded := UTF8ToGsm0338(original)
	decoded := GSM0338ToUTF8(encoded)
	fmt.Printf("word before: %s\nword after gsm: %s\nword after utf8: %s\n", original, encoded, decoded)
}
I'm trying to port some Delphi code that sends data to a Universe database. In order to make the text readable by the DB we need to encode it in OEM.
In Delphi is done this way:
// Build the two 257-entry translation tables (256 byte values plus a trailing
// #0) used to convert between the Universe (OEM) and ANSI character sets.
procedure TForm1.GenerarTablasNLS;
var
  i: integer;
begin
  // Start both tables as the identity mapping over every byte value.
  for i := 0 to 255 do
  begin
    TablaUV_NLS[i] := AnsiChar(i);
    TablaNLS_UV[i] := AnsiChar(i);
  end;
  // Trailing null terminator
  TablaUV_NLS[256] := #0;
  TablaNLS_UV[256] := #0;
  // Let Windows rewrite each table in place: OEM->ANSI and ANSI->OEM.
  // NOTE(review): '#TablaUV_NLS[1]' looks like a mangled '@TablaUV_NLS[1]'
  // (address-of) — OemToCharA/CharToOemA take PAnsiChar arguments. Also note
  // the tables are filled from index [0] but passed starting at [1]; confirm
  // both against the original source.
  OemToCharA(#TablaUV_NLS[1], #TablaUV_NLS[1]);
  CharToOemA(#TablaNLS_UV[1], #TablaNLS_UV[1]);
And then we translate our text simply like this
// Translate a native string into the Universe (OEM) charset, one character at
// a time, through the TablaNLS_UV lookup table.
function StringToUniverse(const Value: string): AnsiString;
var
  p: PChar;     // read cursor over the source string
  q: PAnsiChar; // write cursor over the result buffer
begin
  // Pre-size the result: each input character maps to exactly one byte.
  SetLength(Result, Length(Value));
  if Value = '' then Exit;
  p := Pointer(Value);
  q := Pointer(Result);
  // Walk both buffers in lockstep until the source null terminator.
  // NOTE(review): relies on Value's implicit #0 terminator; an embedded #0
  // in the string would stop the translation early.
  while p^ <> #0 do
  begin
    q^ := TablaNLS_UV[Ord(AnsiChar(p^))];
    Inc(p);
    Inc(q);
  end;
end;
I follow the same logic in Python using a dictionary that stores each character translation
class StringUniverseDict(dict):
    """Translation table that maps unlisted characters to themselves."""

    def __missing__(self, char):
        # Called by dict lookup when `char` has no explicit entry: characters
        # without a translation pass through unchanged.
        return char


TablaString2UV = StringUniverseDict()
def rellenar_tablas_codificacion(tabla=None):
    """Fill the string -> Universe (OEM) character translation table.

    Args:
        tabla: mapping to fill; defaults to the module-level TablaString2UV,
            so existing callers are unaffected.
    """
    if tabla is None:
        tabla = TablaString2UV
    tabla['á'] = ' '         # chr(225) = chr(160)
    tabla['é'] = '‚'         # chr(233) = chr(130)
    tabla['í'] = '¡'         # chr(237) = chr(161)
    tabla['ó'] = '¢'         # chr(243) = chr(162)
    tabla['ú'] = '£'         # chr(250) = chr(163)
    tabla['ñ'] = '¤'         # chr(241) = chr(164)
    tabla['ç'] = '‡'         # chr(231) = chr(135)
    tabla['Á'] = 'µ'         # chr(193) = chr(181)
    tabla['É'] = chr(144)    # chr(201) = chr(144)
    tabla['Í'] = 'Ö'         # chr(205) = chr(214)
    tabla['Ó'] = 'à'         # chr(211) = chr(224)
    tabla['Ñ'] = '¥'         # chr(209) = chr(165)
    tabla['Ç'] = '€'         # chr(199) = chr(128)
    tabla['ü'] = chr(129)    # chr(252) = chr(129)
    tabla[chr(129)] = '_'    # chr(129) = chr(95)
    tabla[chr(141)] = '_'    # chr(141) = chr(95)
    # BUG FIX: the original wrote chr(007); integer literals with a leading
    # zero are a SyntaxError in Python 3.
    tabla['•'] = chr(7)      # chr(149) = chr(7)
    tabla['Å'] = chr(143)    # chr(197) = chr(143)
    tabla['Ø'] = chr(157)    # chr(216) = chr(157)
    tabla['ì'] = chr(141)    # chr(236) = chr(141)
This works "fine" as long as I translate using printable characters. For example, the string
"á é í ó ú ñ ç Á Í Ó Ú Ñ Ç"
is translated, in Delphi, to the following bytes:
0xa0 0x20 0x82 0x20 0xa1 0x20 0xa2 0x20 0xa3 0x20 0xa4 0x20 0x87 0x20 0xb5 0x20 0xd6 0x20 0xe0 0x20 0xe9 0x20 0xa5 0x20 0x80 0xfe 0x73 0x64 0x73
(á translates to ' ', which is chr(160) or 0xA0 in hexa. é is '‚' or chr(130), 0x82 in hexa, í is '¡', char(161) or 0xA1 in hexa and so on)
In Python, when I try to encode this to OEM I do the following:
def convertir_string_a_universe(cadena_python, tabla=None):
    """Translate each character of *cadena_python* through the Universe table.

    Args:
        cadena_python: source text.
        tabla: translation mapping; defaults to the module-level
            TablaString2UV, so existing one-argument callers are unaffected.

    Returns:
        The translated string.
    """
    if tabla is None:
        tabla = TablaString2UV
    # str.join builds the result in one pass, avoiding the quadratic cost of
    # repeated `resultado += ...` concatenation.
    return ''.join(tabla[letra] for letra in cadena_python)
And then, to get the bytes
txt_registro = convertir_string_a_universe(txt_orig)
datos = bytes(txt_registro, 'cp1252')
With this I get the following bytes:
b'\xa0 \x82 \xa1 \xa2 \xa3 \xa4 \x87 \xb5 \xd6 \xe0 \xe9 \xa5 \x80 \x9a'
My problem is that this OEM encoding uses non-printable characters, like in 'É' = chr(144) (0x90 in hex). If I try to call bytes(txt_registro, 'cp1252') with an array where I have translated 'É' into chr(0x90) I get this error:
caracteres_mal = 'Éü'
txt_registro = convertir_string_a_universe(txt_orig)
datos = bytes(txt_registro, 'cp1252')
File "C:\Users\Hector\PyCharmProjects\pyuniverse\pyuniverse\UniverseRegister.py", line 138, in reconstruir_registro_universe
datos = bytes(txt_registro, 'cp1252')
File "C:\Users\Hector\AppData\Local\Programs\Python\Python36-32\lib\encodings\cp1252.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
UnicodeEncodeError: 'charmap' codec can't encode character '\x90' in position 0: character maps to <undefined>
How can I do this OEM encoding without raising this UnicodeEncodeError?
This is because cp1252 does not know about chr(0x90). If you try with utf-8 instead, it will work.
>>> chr(0x90).encode("utf8")
b'\xc2\x90'
I don't understand why you are trying to convert to cp1252 though: you have applied a custom conversion map and then, with bytes(txt_registro, 'cp1252'), you are converting your result again to cp1252.
I think what you are looking for is something like:
datos = bytes(txt_orig, 'uv')
where uv is your custom codec.
So you would have to write an encoder and a decoder for it (which is basically what you have done already). Take a look at https://docs.python.org/3/library/codecs.html#codecs.register
to register a new codec. The function you will register with it should return a CodecInfo object, described earlier in the documentation.
import codecs

def buscar_a_uv(codec):
    # Codec search function: codecs.register calls this with each requested
    # codec name; returning None lets other search functions try.
    if codec == "uv":
        # NOTE(review): codecs.CodecInfo expects encode/decode callables that
        # return an (output, length_consumed) tuple; convertir_string_a_universe
        # as shown returns a plain string, so it needs a small wrapper before
        # this will actually work — see the EDIT note below.
        return codecs.CodecInfo(
            convertir_string_a_universe, convertir_universe_a_string, name="uv")
    else:
        return None

codecs.register(buscar_a_uv)
# With the codec registered, encoding via the custom name replaces the
# manual translate-then-cp1252 two-step.
datos = bytes(txt_orig, 'uv')
EDIT
The encoder/decoder functions should return bytes, so you would need to update convertir_string_a_universe a bit.
I'm trying to remove the trademark symbol (™), but only in the case where it's not followed by any other symbol. For instance, I might have ’, which is a bad encoding of a quotation mark ('), so I don't want to remove the trademark symbol (™) there and thereby break the pattern I'm using to replace xx™ with a quotation mark.
# Running per-sequence replacement counts.
# NOTE: the name shadows the builtin `dict`; kept because the rest of the
# snippet refers to it by this name.
dict = {}

# Byte sequences to replace and their replacements.
# NOTE(review): written as Python 2 byte strings — on Python 3 these literals
# are 3-char text strings, not the UTF-8 bytes of ™ / ’.
chars = {
    '\xe2\x84\xa2': '',   # ™ (trademark sign)
    '\xe2\x80\x99': "'",  # ’ (right single quotation mark)
}

def stats_change(char, number):
    """Add *number* to the running count for *char*, creating the entry."""
    # BUG FIX: dict.has_key() was removed in Python 3; the .get() form is
    # equivalent and works on both Python 2 and 3.
    dict[char] = dict.get(char, 0) + number
def replace_chars(match):
    # re.subn callback: record the matched sequence for statistics, then
    # return its configured replacement ('' for ™, "'" for ’).
    char = match.group(0)
    stats_change(char, 1)
    return chars[char]

# Build an alternation like (\xe2\x84\xa2|\xe2\x80\x99) from the table keys
# and rewrite `i` in place, counting substitutions.
# NOTE(review): the '\\' prefixes escape the first byte of each key inside the
# pattern; `i` and `count_matches` are defined outside this excerpt.
i, nmatches = re.subn("(\\" + '|\\'.join(chars.keys()) + ")", replace_chars, i)
count_matches += nmatches
Input: foo™ oof
Output: foo oof
Input: o’f oof
Output: o'f oof
Any suggestions ?
I am new to Stack Overflow. I have searched for an answer, but didn't find anything.
I have two Raspberry Pi 2B+ boards, each with an nRF24L01 connected. I found a few libraries to make this connection; only one gave any results, but no connection. This one: Github BLavery
I write script to send and to recv:
send.py:
import RPi.GPIO as GPIO
from lib_nrf24 import NRF24
import time
import spidev

# Transmitter: send "Hello World" once per second over an nRF24L01 and print
# any acknowledgement payload the receiver attaches.
GPIO.setmode(GPIO.BCM)
# Two 5-byte pipe addresses; index 1 is the address written to below, and the
# receiver must open its reading pipe on the same address.
pipes = [[0xe7, 0xe7, 0xe7, 0xe7, 0xe7], [0xc2, 0xc2, 0xc2, 0xc2, 0xc2]]
radio = NRF24(GPIO, spidev.SpiDev())
# SPI bus 0, CE on GPIO17 (matches the wiring list in the question).
radio.begin(0, 17)
radio.setPayloadSize(32)
radio.setChannel(0x60)
radio.setDataRate(NRF24.BR_2MBPS)
radio.setPALevel(NRF24.PA_MIN)
radio.setAutoAck(True)
radio.enableDynamicPayloads()
radio.enableAckPayload()
radio.openWritingPipe(pipes[1])
# Dump the chip registers for debugging.
radio.printDetails()
while True:
    message = list("Hello World")
    radio.write(message)
    print("We sent the message of {}".format(message))
    # Check if it returned a ackPL
    if radio.isAckPayloadAvailable():
        returnedPL = []
        radio.read(returnedPL, radio.getDynamicPayloadSize())
        print("Our returned payload was {}".format(returnedPL))
    else:
        print("No payload received")
    time.sleep(1)
recv.py:
import RPi.GPIO as GPIO
from lib_nrf24 import NRF24
import time
import spidev

# Receiver: listen for packets from send.py and answer each one with a short
# acknowledgement payload.
GPIO.setmode(GPIO.BCM)
# Must match the transmitter's addresses; pipes[1] is the shared address.
pipes = [[0xe7, 0xe7, 0xe7, 0xe7, 0xe7], [0xc2, 0xc2, 0xc2, 0xc2, 0xc2]]
radio = NRF24(GPIO, spidev.SpiDev())
# SPI bus 0, CE on GPIO17.
radio.begin(0, 17)
radio.setPayloadSize(32)
radio.setChannel(0x60)
radio.setDataRate(NRF24.BR_2MBPS)
# BUG FIX: the method is setPALevel (capital 'L'), as used in send.py; the
# original radio.setPAlevel would raise AttributeError.
radio.setPALevel(NRF24.PA_MIN)
radio.setAutoAck(True)
radio.enableDynamicPayloads()
radio.enableAckPayload()
# Read on the same address the transmitter writes to.
radio.openReadingPipe(1, pipes[1])
radio.printDetails()
radio.startListening()
while True:
    ackPL = [1]
    # Poll until a packet is available, sleeping 10 ms between checks.
    while not radio.available(0):
        time.sleep(1 / 100)
    receivedMessage = []
    radio.read(receivedMessage, radio.getDynamicPayloadSize())
    print("Received: {}".format(receivedMessage))
    print("Translating the receivedMessage into unicode characters...")
    string = ""
    for n in receivedMessage:
        # Keep only printable ASCII byte values.
        if 32 <= n <= 126:
            string += chr(n)
    print(string)
    # Queue the acknowledgement payload for the next incoming packet.
    radio.writeAckPayload(1, ackPL, len(ackPL))
    print("Loaded payload reply of {}".format(ackPL))
Everything seems to be alright, below are code returned by both scripts:
send:
STATUS = 0x03 RX_DR=0 TX_DS=0 MAX_RT=0 RX_P_NO=1 TX_FULL=1
RX_ADDR_P0-1 =
0xf8f8f8f8f8 0xf8f8f8f8f8
RX_ADDR_P2-5 =
0xf8
0xf9
0xf9
0xf9
TX_ADDR =
0xf8f8f8f8f8
RX_PW_P0-6 =
0x0c
0x00
0x00
0x00
0x00
0x00
EN_AA =
0x0f
EN_RXADDR =
0x00
RF_CH =
0x1c
RF_SETUP =
0x00
CONFIG =
0x03
DYNPD/FEATURE =
0x03
0x01
Data Rate = 1MBPS
Model = nRF24L01
CRC Length = Disabled
PA Power = PA_MIN
We sent the message of ['H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd']
No payload received
recv.py:
STATUS = 0x03 RX_DR=0 TX_DS=0 MAX_RT=0 RX_P_NO=1 TX_FULL=1
RX_ADDR_P0-1 =
0xf8f8f8f8f8 0xf8f8f8f8f8
RX_ADDR_P2-5 =
0xf8
0xf9
0xf9
0xf9
TX_ADDR =
0xf8f8f8f8f8
RX_PW_P0-6 =
0x0c
0x0c
0x00
0x00
0x00
0x00
EN_AA =
0x0f
EN_RXADDR =
0x00
RF_CH =
0x1c
RF_SETUP =
0x00
CONFIG =
0x03
DYNPD/FEATURE =
0x03
0x01
Data Rate = 1MBPS
Model = nRF24L01
CRC Length = Disabled
PA Power = PA_MIN
Received: []
Translating the receivedMessage into unicode characters...
Loaded payload reply of [1]
I don't really understand why one won't connect to the other.
Both have the same wiring:
nRF24L01-Raspberry Pi (Pin#)
GND - GND (6)
VCC - 3,3V (1)
CE - GPIO17 (11)
CSN - GPIO08(24)
SCK - GPIO11 (23)
MOSI - GPIO10 (19)
MISO - GPIO25 (22)
IRQ - unconnected
I need to send information from one RPi to second to control engine via PWM.
Can I ask for help?
I need to replace some special characters from user input for different platform (i.e. Linux and Windows) using Python. Here is my code:
if request.method == 'POST':
    # Raw user input from the submitted 'react' form field; .get returns
    # None when the field is absent. `request` comes from the enclosing
    # Django view, not shown in this excerpt.
    rname1 = request.POST.get('react')
Here I am getting the user input by post method. I need to the following characters to remove from the user input (if there is any).
1- Escape or filter special characters for windows, ( ) < > * ‘ = ? ; [ ] ^ ~ ! . ” % # / \ : + , `
2- Escape or filter special characters for Linux, { } ( ) < > * ‘ = ? ; [ ] $ – # ~ ! . ” % / \ : + , `
The special characters are given above. Here I need to remove for both Linux and Windows.
Python strings have a built in method translate for substitution/deletion of characters. You need to build a translation table and then call the function.
import sys

# Characters to strip, split into individual one-character strings.
# BUG FIX: sys.platform is "darwin" on macOS, which contains the substring
# "win", so the original `"win" in sys.platform` treated macOS as Windows.
# Testing the prefix matches Windows ("win32") only.
if sys.platform.startswith("win"):
    special = """( ) < > * ‘ = ? ; [ ] ^ ~ ! . ” % # / \ : + , `""".split()
else:
    special = """{ } ( ) < > * ‘ = ? ; [ ] $ – # ~ ! . ” % / \ : + , `""".split()
# Mapping a character to None makes str.translate delete it.
trans_dict = {character: None for character in special}
trans_table = str.maketrans(trans_dict)
print("Lo+=r?e~~m ipsum dol;or sit!! amet, consectet..ur ad%".translate(trans_table))
Will print Lorem ipsum dolor sit amet consectetur ad.
If you want to use a replacement character instead of deleting, then replace None above with the character. You can build a translation table with specific substitutions, `{"a": "m", "b": "n", ...}
Edit: The above snippet is indeed in Python3. In Python2 (TiO) it's easier to delete characters:
>>> import sys
>>> import string
>>> if "win" in sys.platform:
... special = """()<>*'=?;[]^~!%#/\:=,`"""
... else:
... special = """{}()<>*'=?;[]$-#~!."%/\:+"""
...
>>> s = "Lo+r?e~~/\#<>m ips()u;m"
>>> string.translate(s, None, special)
'Lorem ipsum'
Note that I've substituted ‘ with ' and similarly replaced ” with " because I think you're only dealing with ascii strings.