Codec Errors in Python

Does anyone know the name of a codec that can translate any random assortment of bytes into a string? I have been getting the following error after encoding, encrypting, and decoding a string in tkinter.Text.
UnicodeDecodeError: 'utf8' codec can't decode
byte 0x99 in position 151: unexpected code byte
The code used to generate the error follows below. The UTF8 codec set at the top has problems translating some byte sequences back into a string. What I am looking for is an answer that solves the problem, not just a pointer in the right direction.
from tkinter import *
import traceback
from tkinter.scrolledtext import ScrolledText
CODEC = 'utf8'
################################################################################
class MarkovDemo:
def __init__(self, master):
self.prompt_size = Label(master, anchor=W, text='Encode Word Size')
self.prompt_size.pack(side=TOP, fill=X)
self.size_entry = Entry(master)
self.size_entry.insert(0, '8')
self.size_entry.pack(fill=X)
self.prompt_plain = Label(master, anchor=W, text='Plaintext Characters')
self.prompt_plain.pack(side=TOP, fill=X)
self.plain_entry = Entry(master)
self.plain_entry.insert(0, '""')
self.plain_entry.pack(fill=X)
self.showframe = Frame(master)
self.showframe.pack(fill=X, anchor=W)
self.showvar = StringVar(master)
self.showvar.set("encode")
self.showfirstradio = Radiobutton(self.showframe,
text="Encode Plaintext",
variable=self.showvar,
value="encode",
command=self.reevaluate)
self.showfirstradio.pack(side=LEFT)
self.showallradio = Radiobutton(self.showframe,
text="Decode Cyphertext",
variable=self.showvar,
value="decode",
command=self.reevaluate)
self.showallradio.pack(side=LEFT)
self.inputbox = ScrolledText(master, width=60, height=10, wrap=WORD)
self.inputbox.pack(fill=BOTH, expand=1)
self.dynamic_var = IntVar()
self.dynamic_box = Checkbutton(master, variable=self.dynamic_var,
text='Dynamic Evaluation',
offvalue=False, onvalue=True,
command=self.reevaluate)
self.dynamic_box.pack()
self.output = Label(master, anchor=W, text="This is your output:")
self.output.pack(fill=X)
self.outbox = ScrolledText(master, width=60, height=10, wrap=WORD)
self.outbox.pack(fill=BOTH, expand=1)
self.inputbox.bind('<Key>', self.reevaluate)
def select_all(event=None):
event.widget.tag_add(SEL, 1.0, 'end-1c')
event.widget.mark_set(INSERT, 1.0)
event.widget.see(INSERT)
return 'break'
self.inputbox.bind('<Control-Key-a>', select_all)
self.outbox.bind('<Control-Key-a>', select_all)
self.inputbox.bind('<Control-Key-/>', lambda event: 'break')
self.outbox.bind('<Control-Key-/>', lambda event: 'break')
self.outbox.config(state=DISABLED)
def reevaluate(self, event=None):
if event is not None:
if event.char == '':
return
if self.dynamic_var.get():
text = self.inputbox.get(1.0, END)[:-1]
if len(text) < 10:
return
text = text.replace('\n \n', '\n\n')
mode = self.showvar.get()
assert mode in ('decode', 'encode'), 'Bad mode!'
if mode == 'encode':
# Encode Plaintext
try:
# Evaluate the plaintext characters
plain = self.plain_entry.get()
if plain:
PC = eval(self.plain_entry.get())
else:
PC = ''
self.plain_entry.delete(0, END)
self.plain_entry.insert(0, '""')
# Evaluate the word size
size = self.size_entry.get()
if size:
XD = int(size)
while grid_size(text, XD, PC) > 1 << 20:
XD -= 1
else:
XD = 0
grid = 0
while grid <= 1 << 20:
grid = grid_size(text, XD, PC)
XD += 1
XD -= 1
# Correct the size and encode
self.size_entry.delete(0, END)
self.size_entry.insert(0, str(XD))
cyphertext, key, prime = encrypt_str(text, XD, PC)
except:
traceback.print_exc()
else:
buffer = ''
for block in key:
buffer += repr(block)[2:-1] + '\n'
buffer += repr(prime)[2:-1] + '\n\n' + cyphertext
self.outbox.config(state=NORMAL)
self.outbox.delete(1.0, END)
self.outbox.insert(END, buffer)
self.outbox.config(state=DISABLED)
else:
# Decode Cyphertext
try:
header, cypher = text.split('\n\n', 1)
lines = header.split('\n')
for index, item in enumerate(lines):
try:
lines[index] = eval('b"' + item + '"')
except:
lines[index] = eval("b'" + item + "'")
plain = decrypt_str(cypher, tuple(lines[:-1]), lines[-1])
except:
traceback.print_exc()
else:
self.outbox.config(state=NORMAL)
self.outbox.delete(1.0, END)
self.outbox.insert(END, plain)
self.outbox.config(state=DISABLED)
else:
text = self.inputbox.get(1.0, END)[:-1]
text = text.replace('\n \n', '\n\n')
mode = self.showvar.get()
assert mode in ('decode', 'encode'), 'Bad mode!'
if mode == 'encode':
try:
XD = int(self.size_entry.get())
PC = eval(self.plain_entry.get())
size = grid_size(text, XD, PC)
assert size
except:
pass
else:
buffer = 'Grid size will be:\n' + convert(size)
self.outbox.config(state=NORMAL)
self.outbox.delete(1.0, END)
self.outbox.insert(END, buffer)
self.outbox.config(state=DISABLED)
################################################################################
import random
CRYPT = random.SystemRandom()
################################################################################
# This section includes functions that
# can test the required key and bootstrap.
# sudoko_key
# - should be a proper "markov" key
def _check_sudoku_key(sudoku_key):
# Ensure key is a tuple with more than one item.
assert isinstance(sudoku_key, tuple), '"sudoku_key" must be a tuple'
assert len(sudoku_key) > 1, '"sudoku_key" must have more than one item'
# Test first item.
item = sudoku_key[0]
assert isinstance(item, bytes), 'first item must be an instance of bytes'
assert len(item) > 1, 'first item must have more than one byte'
assert len(item) == len(set(item)), 'first item must have unique bytes'
# Test the rest of the key.
for obj in sudoku_key[1:]:
assert isinstance(obj, bytes), 'remaining items must be of bytes'
assert len(obj) == len(item), 'all items must have the same length'
assert len(obj) == len(set(obj)), \
'remaining items must have unique bytes'
assert len(set(item)) == len(set(item).union(set(obj))), \
'all items must have the same bytes'
# boot_strap
# - should be a proper "markov" bootstrap
# - we will call this a "primer"
# sudoko_key
# - should be a proper "markov" key
def _check_boot_strap(boot_strap, sudoku_key):
assert isinstance(boot_strap, bytes), '"boot_strap" must be a bytes object'
assert len(boot_strap) == len(sudoku_key) - 1, \
'"boot_strap" length must be one less than "sudoku_key" length'
item = sudoku_key[0]
assert len(set(item)) == len(set(item).union(set(boot_strap))), \
'"boot_strap" may only have bytes found in "sudoku_key"'
################################################################################
# This section includes functions capable
# of creating the required key and bootstrap.
# bytes_set should be any collection of bytes
# - it should be possible to create a set from them
# - these should be the bytes on which encryption will follow
# word_size
# - this will be the size of the "markov" chains program uses
# - this will be the number of dimensions the "grid" will have
# - one less character will make up bootstrap (or primer)
def make_sudoku_key(bytes_set, word_size):
key_set = set(bytes_set)
blocks = []
for block in range(word_size):
blocks.append(bytes(CRYPT.sample(key_set, len(key_set))))
return tuple(blocks)
# sudoko_key
# - should be a proper "markov" key
def make_boot_strap(sudoku_key):
block = sudoku_key[0]
return bytes(CRYPT.choice(block) for byte in range(len(sudoku_key) - 1))
################################################################################
# This section contains functions needed to
# create the multidimensional encryption grid.
# sudoko_key
# - should be a proper "markov" key
def make_grid(sudoku_key):
grid = expand_array(sudoku_key[0], sudoku_key[1])
for block in sudoku_key[2:]:
grid = expand_array(grid, block)
return grid
# grid
# - should be an X dimensional grid from make_grid
# block_size
# - comes from length of one block in a sudoku_key
def make_decode_grid(grid, block_size):
cache = []
for part in range(0, len(grid), block_size):
old = grid[part:part+block_size]
new = [None] * block_size
key = sorted(old)
for index, byte in enumerate(old):
new[key.index(byte)] = key[index]
cache.append(bytes(new))
return b''.join(cache)
# grid
# - should be an X dimensional grid from make_grid
# block
# - should be a block from a sudoku_key
# - should have same unique bytes as the expanding grid
def expand_array(grid, block):
cache = []
grid_size = len(grid)
block_size = len(block)
for byte in block:
index = grid.index(bytes([byte]))
for part in range(0, grid_size, block_size):
cache.append(grid[part+index:part+block_size])
cache.append(grid[part:part+index])
return b''.join(cache)
################################################################################
# The first three functions can be used to check an encryption
# grid. The eval_index function is used to evaluate a grid cell.
# grid
# - grid object to be checked
# - grid should come from the make_grid function
# - must have unique bytes along each axis
# block_size
# - comes from length of one block in a sudoku_key
# - this is the length of one edge along the grid
# - each axis is this many unit long exactly
# word_size
# - this is the number of blocks in a sudoku_key
# - this is the number of dimensions in a grid
# - this is the length needed to create a needed markon chain
def check_grid(grid, block_size, word_size):
build_index(grid, block_size, word_size, [])
# create an index to access the grid with
def build_index(grid, block_size, word_size, index):
for number in range(block_size):
index.append(number)
if len(index) == word_size:
check_cell(grid, block_size, word_size, index)
else:
build_index(grid, block_size, word_size, index)
index.pop()
# compares the contents of a cell along each grid axis
def check_cell(grid, block_size, word_size, index):
master = eval_index(grid, block_size, index)
for axis in range(word_size):
for value in range(block_size):
if index[axis] != value:
copy = list(index)
copy[axis] = value
slave = eval_index(grid, block_size, copy)
assert slave != master, 'Cell not unique along axis!'
# grid
# - grid object to be accessed and evaluated
# - grid should come from the make_grid function
# - must have unique bytes along each axis
# block_size
# - comes from length of one block in a sudoku_key
# - this is the length of one edge along the grid
# - each axis is this many unit long exactly
# index
# - list of coordinates to access the grid
# - should be of length word_size
# - should be of length equal to number of dimensions in the grid
def eval_index(grid, block_size, index):
offset = 0
for power, value in enumerate(reversed(index)):
offset += value * block_size ** power
return grid[int(offset)]
################################################################################
# The following functions act as a suite that can ultimately
# encrpyt strings, though other functions can be built from them.
# bytes_obj
# - the bytes to encode
# byte_map
# - byte tranform map for inserting into the index
# grid
# - X dimensional grid used to evaluate markov chains
# index
# - list that starts the index for accessing grid (primer)
# - it should be of length word_size - 1
# block_size
# - length of each edge in a grid
def _encode(bytes_obj, byte_map, grid, index, block_size):
cache = bytes()
index = [0] + index
for byte in bytes_obj:
if byte in byte_map:
index.append(byte_map[byte])
index = index[1:]
cache += bytes([eval_index(grid, block_size, index)])
else:
cache += bytes([byte])
return cache, index[1:]
# bytes_obj
# - the bytes to encode
# sudoko_key
# - should be a proper "markov" key
# - this key will be automatically checked for correctness
# boot_strap
# - should be a proper "markov" bootstrap
def encrypt(bytes_obj, sudoku_key, boot_strap):
_check_sudoku_key(sudoku_key)
_check_boot_strap(boot_strap, sudoku_key)
# make byte_map
array = sorted(sudoku_key[0])
byte_map = dict((byte, value) for value, byte in enumerate(array))
# create two more arguments for encode
grid = make_grid(sudoku_key)
index = list(map(byte_map.__getitem__, boot_strap))
# run the actual encoding algorithm and create reversed map
code, index = _encode(bytes_obj, byte_map, grid, index, len(sudoku_key[0]))
rev_map = dict(reversed(item) for item in byte_map.items())
# fix the boot_strap and return the results
boot_strap = bytes(rev_map[number] for number in index)
return code, boot_strap
# string
# - should be the string that you want encoded
# word_size
# - length you want the markov chains to be of
# plain_chars
# - characters that you do not want to encrypt
def encrypt_str(string, word_size, plain_chars=''):
byte_obj = string.encode(CODEC)
encode_on = set(byte_obj).difference(set(plain_chars.encode()))
sudoku_key = make_sudoku_key(encode_on, word_size)
boot_strap = make_boot_strap(sudoku_key)
cyphertext = encrypt(byte_obj, sudoku_key, boot_strap)[0]
# return encrypted string, key, and original bootstrap
return cyphertext.decode(CODEC), sudoku_key, boot_strap
def grid_size(string, word_size, plain_chars):
encode_on = set(string.encode()).difference(set(plain_chars.encode()))
return len(encode_on) ** word_size
################################################################################
# The following functions act as a suite that can ultimately
# decrpyt strings, though other functions can be built from them.
# bytes_obj
# - the bytes to encode
# byte_map
# - byte tranform map for inserting into the index
# grid
# - X dimensional grid used to evaluate markov chains
# index
# - list that starts the index for accessing grid (primer)
# - it should be of length word_size - 1
# block_size
# - length of each edge in a grid
def _decode(bytes_obj, byte_map, grid, index, block_size):
cache = bytes()
index = [0] + index
for byte in bytes_obj:
if byte in byte_map:
index.append(byte_map[byte])
index = index[1:]
decoded = eval_index(grid, block_size, index)
index[-1] = byte_map[decoded]
cache += bytes([decoded])
else:
cache += bytes([byte])
return cache, index[1:]
# bytes_obj
# - the bytes to decode
# sudoko_key
# - should be a proper "markov" key
# - this key will be automatically checked for correctness
# boot_strap
# - should be a proper "markov" bootstrap
def decrypt(bytes_obj, sudoku_key, boot_strap):
_check_sudoku_key(sudoku_key)
_check_boot_strap(boot_strap, sudoku_key)
# make byte_map
array = sorted(sudoku_key[0])
byte_map = dict((byte, value) for value, byte in enumerate(array))
# create two more arguments for decode
grid = make_grid(sudoku_key)
grid = make_decode_grid(grid, len(sudoku_key[0]))
index = list(map(byte_map.__getitem__, boot_strap))
# run the actual decoding algorithm and create reversed map
code, index = _decode(bytes_obj, byte_map, grid, index, len(sudoku_key[0]))
rev_map = dict(reversed(item) for item in byte_map.items())
# fix the boot_strap and return the results
boot_strap = bytes(rev_map[number] for number in index)
return code, boot_strap
# string
# - should be the string that you want decoded
# word_size
# - length you want the markov chains to be of
# plain_chars
# - characters that you do not want to encrypt
def decrypt_str(string, sudoku_key, boot_strap):
byte_obj = string.encode(CODEC)
plaintext = decrypt(byte_obj, sudoku_key, boot_strap)[0]
# return encrypted string, key, and original bootstrap
return plaintext.decode(CODEC)
################################################################################
def convert(number):
"Convert bytes into human-readable representation."
assert 0 < number < 1 << 110, 'Number Out Of Range'
ordered = reversed(tuple(format_bytes(partition_number(number, 1 << 10))))
cleaned = ', '.join(item for item in ordered if item[0] != '0')
return cleaned
################################################################################
def partition_number(number, base):
"Continually divide number by base until zero."
div, mod = divmod(number, base)
yield mod
while div:
div, mod = divmod(div, base)
yield mod
def format_bytes(parts):
"Format partitioned bytes into human-readable strings."
for power, number in enumerate(parts):
yield '{} {}'.format(number, format_suffix(power, number))
def format_suffix(power, number):
"Compute the suffix for a certain power of bytes."
return (PREFIX[power] + 'byte').capitalize() + ('s' if number != 1 else '')
################################################################################
PREFIX = ' kilo mega giga tera peta exa zetta yotta bronto geop'.split(' ')
################################################################################
if __name__ == '__main__':
root = Tk()
root.title('Markov Demo')
demo = MarkovDemo(root)
root.mainloop()

A string is by definition a sequence of bytes that only has meaning when interpreted with knowledge of its encoding. That's one reason why the equivalent of Python 2's string type in Python 3 is the bytes type. As long as you know the encoding of the strings you're working with, I'm not sure you specifically need to recode them just to compress/encrypt them. Details of what you are actually doing might make a difference, though.

Python's decode takes an errors argument. The default is 'strict', which raises an exception.
Wherever you are doing the decoding, you can pass 'ignore' or 'replace' instead, and this will take care of your problem.
Please see the codecs documentation.

The Python HOWTOs in the Python v3.1.1 documentation include a helpful Unicode HOWTO. Its table of contents has an entry on Python's Unicode Support that explains the string and bytes types.
The String Type
>>> b'\x80abc'.decode("utf-8", "strict")
Traceback (most recent call last):
File "<stdin>", line 1, in ?
UnicodeDecodeError: 'utf8' codec can't decode byte 0x80 in position 0:
unexpected code byte
>>> b'\x80abc'.decode("utf-8", "replace")
'\ufffdabc'
>>> b'\x80abc'.decode("utf-8", "ignore")
'abc'
Converting to Bytes
>>> u = chr(40960) + 'abcd' + chr(1972)
>>> u.encode('utf-8')
b'\xea\x80\x80abcd\xde\xb4'
>>> u.encode('ascii')
Traceback (most recent call last):
File "<stdin>", line 1, in ?
UnicodeEncodeError: 'ascii' codec can't encode character '\ua000' in
position 0: ordinal not in range(128)
>>> u.encode('ascii', 'ignore')
b'abcd'
>>> u.encode('ascii', 'replace')
b'?abcd?'
>>> u.encode('ascii', 'xmlcharrefreplace')
b'&#40960;abcd&#1972;'
One possible solution to the problem listed above is to convert all occurrences of
.encode(CODEC) to .encode(CODEC, 'ignore'). Likewise, every .decode(CODEC) becomes .decode(CODEC, 'ignore').
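For example, applied to the decrypt_str function from the question (a minimal sketch; keep in mind that 'ignore' silently drops any bytes the codec cannot handle, so part of the cyphertext can be lost):
def decrypt_str(string, sudoku_key, boot_strap):
    # encode the cyphertext back to bytes, skipping anything UTF-8 cannot represent
    byte_obj = string.encode(CODEC, 'ignore')
    plaintext = decrypt(byte_obj, sudoku_key, boot_strap)[0]
    # decode the recovered bytes, again skipping undecodable sequences
    return plaintext.decode(CODEC, 'ignore')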

Related

code to compute the Merkle root for the block

I would like to use the code below to compute the Merkle root for the block. The detailed link is here.
This is the block whose Merkle root I want to generate using the given code.
I have this error:
File "merkle.py", line 126, in <module>
    print(merkle(txHashes))
File "merkle.py", line 10, in merkle
    newHashList.append(hash2(hashList[i], hashList[i+1]))
File "merkle.py", line 18, in hash2
    a1 = a.decode('hex')[::-1]
AttributeError: 'str' object has no attribute 'decode'
when I try to execute this code in python 3.8.2:
import hashlib
# Hash pairs of items recursively until a single value is obtained
def merkle(hashList):
    if len(hashList) == 1:
        return hashList[0]
    newHashList = []
    # Process pairs. For odd length, the last is skipped
    for i in range(0, len(hashList)-1, 2):
        newHashList.append(hash2(hashList[i], hashList[i+1]))
    if len(hashList) % 2 == 1:  # odd, hash last item twice
        newHashList.append(hash2(hashList[-1], hashList[-1]))
    return merkle(newHashList)

def hash2(a, b):
    # Reverse inputs before and after hashing
    # due to big-endian / little-endian nonsense
    a1 = a.decode('hex')[::-1]
    b1 = b.decode('hex')[::-1]
    h = hashlib.sha256(hashlib.sha256(a1+b1).digest()).digest()
    return h[::-1].encode('hex')
# https://blockexplorer.com/rawblock/0000000000000000e067a478024addfecdc93628978aa52d91fabd4292982a50
txHashes = [
"00baf6626abc2df808da36a518c69f09b0d2ed0a79421ccfde4f559d2e42128b",
"91c5e9f288437262f218c60f986e8bc10fb35ab3b9f6de477ff0eb554da89dea",
"46685c94b82b84fa05b6a0f36de6ff46475520113d5cb8c6fb060e043a0dbc5c",
"ba7ed2544c78ad793ef5bb0ebe0b1c62e8eb9404691165ffcb08662d1733d7a8",
"b8dc1b7b7ed847c3595e7b02dbd7372aa221756b718c5f2943c75654faf48589",
"25074ef168a061fcc8663b4554a31b617683abc33b72d2e2834f9329c93f8214",
"0fb8e311bffffadc6dc4928d7da9e142951d3ba726c8bde2cf1489b62fb9ebc5",
"c67c79204e681c8bb453195db8ca7d61d4692f0098514ca198ccfd1b59dbcee3",
"bd27570a6cbd8ad026bfdb8909fdae9321788f0643dea195f39cd84a60a1901b",
"41a06e53ffc5108358ddcec05b029763d714ae9f33c5403735e8dee78027fe74",
"cc2696b44cb07612c316f24c07092956f7d8b6e0d48f758572e0d611d1da6fb9",
"8fc508772c60ace7bfeb3f5f3a507659285ea6f351ac0474a0a9710c7673d4fd",
"62fed508c095446d971580099f976428fc069f32e966a40a991953b798b28684",
"928eadbc39196b95147416eedf6f635dcff818916da65419904df8fde977d5db",
"b137e685df7c1dffe031fb966a0923bb5d0e56f381e730bc01c6d5244cfe47c1",
"b92207cee1f9e0bfbd797b05a738fab9de9c799b74f54f6b922f20bd5ec23dd6",
"29d6f37ada0481375b6903c6480a81f8deaf2dcdba03411ed9e8d3e5684d02dd",
"48158deb116e4fd0429fbbbae61e8e68cb6d0e0c4465ff9a6a990037f88c489c",
"be64ea86960864cc0a0236bbb11f232faf5b19ae6e2c85518628f5fae37ec1ca",
"081363552e9fff7461f1fc6663e1abd0fb2dd1c54931e177479a18c4c26260e8",
"eb87c25dd2b2537b1ff3dbabc420e422e2a801f1bededa6fa49ef7980feaef70",
"339e16fcc11deb61ccb548239270af43f5ad34c321416bada4b8d66467b1c697",
"4ad6417a3a04179482ed2e4b7251c396e38841c6fba8d2ce9543337ab7c93c02",
"c28a45cded020bf424b400ffc9cb6f2f85601934f18c34a4f78283247192056a",
"882037cc9e3ee6ddc2d3eba86b7ca163533b5d3cbb16eaa38696bb0a2ea1137e",
"179bb936305b46bb0a9df330f8701984c725a60e063ad5892fa97461570b5c04",
"9517c585d1578cb327b7988f38e1a15c663955ea288a2292b40d27f232fbb980",
"2c7e07d0cf42e5520bcbfe2f5ef63761a9ab9d7ccb00ea346195eae030f3b86f",
"534f631fc42ae2d309670e01c7a0890e4bfb65bae798522ca14df09c81b09734",
"104643385619adb848593eb668a8066d1f32650edf35e74b0fc3306cb6719448",
"87ac990808239c768182a752f4f71cd98558397072883c7e137efb49d22b9231",
"9b3e2f1c47d59a444e9b6dc725f0ac6baf160d22f3a9d399434e5e65b14eccb0",
"fbe123066ae5add633a542f151663db4eb5a7053e388faadb40240671ae1b09b",
"1dd07e92e20b3cb9208af040031f7cfc4efd46cc31ec27be20a1047965a42849",
"2709bb9ed27353c1fd76b9240cab7576a44de68945e256ad44b2cb8d849a8060",
"d0174db2c712573432a7869c1508f371f3a1058aeedddc1b53a7e04d7c56c725",
"b4a16f724cddb8f77ddf3d2146a12c4be13d503885eaba3518a03da005009f62",
"2aa706d75decbe57745e01d46f9f5d30a08dedaf3288cee14cc4948e3684e1d4",
"ee49c5f6a5129ccaf2abebbc1d6d07a402a600af6221476b89aafaa683ca95b7",
"bea1011c77874845e9b4c876ed2ceebd530d428dd4a564ad003d9211d40bb091",
"f1e88ffc2b1de2aa4827002f06943ce5468735f7433f960bf01e75885b9f832b",
"19247d017e002fb9143d1a89eb921222a94f8a3d0faaf2e05b0f594989edc4c4",
"13f714ff62ee7d26b6d69ca980c141ebc54e9f71d2697083fe6c5efc1b02bd0f",
"0c78cbb8246572f015fbdc53dc9798fa54d1119ec77c1f07ac310bcbcc40dbf8",
"4bcde0ef92a6d24a2be7be50ac5e5299d776df2e6229ba5d475c2491da94f255",
"0cfd7d1058502730cf0b2ffa880c78ef534651e06832b5d87c0d7eb84eac5b0c",
"3a168f794d6e0c614429ad874317cc4cd67a8177214880ff6ea1704d29228c2f",
"f9a555d817334397b402518d6fd959dc73d981ee7f5fe67969b63974ebbef127",
"24b52691f66eaed4ce391a473902e309018257c98b9f02aaa33b399c9e6f3168",
"a37b5e623dc26a180d9e2c9510d06885b014e86e533adb63ec40511e10b55046",
"9dbaeb485e51d9e25a5621dc46e0bc0aaf51fb26be5acc4e370b96f62c469b80",
"a6431d3d39f6c38c5df48405090752cab03bfdf5c77cf881b18a946807fba74a",
"faa77e309f125373acf19855dd496fffe2f74962e545420844557a3adc7ebc11",
"3523f52543ecfea2f78486dc91550fad0e6467d46d9d9c82ca63b2e0230bfa71",
"a0583e358e42d77d18d1fd0533ff0a65615fc3b3112061ef92f168a00bf640c1",
"42ae900888d5e5dde59c8e3d06e13db9e84ef05d27726d4b67fd00c50cd9406a",
"154940777d3ff78f592ef02790131a59263c36b4958bbc836f9a767ea1a9f178",
"6a0337de6ac75eecf748306e8ebc5bfe5c811a1481ae50f6956a9e7f26a679f5",
"c99530c2148e09688d0b88795625943371183bf1f5d56c7446c6ed51ea133589",
"626421dbe8ad6a0fd0d622d5dd3308a1cdc00b98575a41a91fe01a439e6f40bd",
"b2f3a559f605a158cc395126c3cf394a7e92a53b7514c75157e1dc43a6c7f93e",
"dffe06d1bea81f2a01c76786404bb867258f9e68013bf25454097ce935090738",
"0860159ec7a2a51ce107c182a988c40b4bc2057a734354a1219b6c65e72640ed",
"a405ff1bb51846b1867acc0b0da17f6f9616e592a0a7ff5ef3297c1ecfd60911",
"a7d451924263284765f6343bca8a21b79b89ebfe611c7355dd88e0ec1c29e232",
"41c758d08a4d3fe4d90645711589b832a2cd54dd25bd5b66e463e5d389a53aff",
"a05c1a93a521fa5dbc1790cfbb808893453a428a65f2c6b2d51249fbb12db309",
"90997920aa9786e10f513cfdd14e294feee6739cee1ab61b3fb1e3f42e7a915d",
"99fcb9cb62c20a3135484a70bd3f73983f8f3b7b26266dad34f3993958a7642c",
"e05f9a668b37e5f78bd3b9d047f29f92b33a87f11dd48390410006f858188b7b",
"56dbc65895f7992da4a6985e7edba4d1c00879f1b28442c644c8a07658ceab27",
"5e9004fe262b829563d0804656ba68b1de1690401f08a1915273230d8c902fc0",
"1ea9ed3717523c5e304b7a7ac8058a87fb4f3fed8c6004769f226c9bb67e79c5",
"f0f1a4c009b3f1b2729e89898e2f5c0fcdc312edea5df884a9c897cb90e4c566",
"b5bb4ddf04863e6a60f33cb96c20dac8175d3bae55f335781503143c97a50e43",
"f14cc97a20c6f627b4b78301352ae35463bc359362589cd178a06c0fa90850b7",
"628801c8f614015c0fa0ccb2768cccc3e7b9d41ceed06071ce2534d31f7236d6",
"3be1013c8f8da150e2195408093153b55b08b037fd92db8bb5e803f4c2538aae",
"c9e1f8777685f54ba65c4e02915fd649ee1edcbf9c77ddf584b943d27efb86c3",
"4274e92ed3bd02eb101baa5fb8ff7b96236830762d08273749fbb5166db8ab0b",
"aa84c955bea04c7cee8f5bbbec97d25930fcaca363eed1b8cad37b931556d3e3",
"d6a29c948677fb1f71aaf16debc3d071a4dd349458eb9e056dce3a000ff853da",
"ba84bdb3d78367ca365016ac4bff9269576eb010f874c2967af73e0de5638de0",
"1546c79951e3b541bc64d1957b565b7a2850fc87192c7b374aee6cfc69b9805e",
"f119227d492ebe27fe9aae321980802454dfa64b2691efbe796c5075d5b07f62",
"b8cf13d64818b32f96bbb585998b1bc9505f6a94055488e5a71fee9479c6f2a9",
"1aaf459705b6afef2d7b83e3f181f1af55be0813daf55edce104cc59abc28ed7",
"61ac185c8f520b5e3134953dc52ff292a40e1e96b088dab259558a9d240ec02f",
"2da96e3154d7ec2329f787b73cb8a436b92d64cf3cc28e920d073279ea73b5f8",
"1c4d72ce733b971b9ec4e24f37d733355f6f2ea635cc67ffb3e22748484df446",
"2a6f89769f3272ac8c7a36a42a57627eca6b260ab2c76d8046a27d44d4034893",
"f8d11df51a2cc113698ebf39a958fe81179d7d973d2044322771c0fe63f4d7c9",
"f2287f17a4fa232dca5715c24a92f7112402a8101b9a7b276fb8c8f617376b90",
"bb5ee510a4fda29cae30c97e7eee80569d3ec3598465f2d7e0674c395e0256e9",
"647ab8c84365620d60f2523505d14bd230b5e650c96dee48be47770063ee7461",
"34b06018fcc33ba6ebb01198d785b0629fbdc5d1948f688059158f053093f08b",
"ff58b258dab0d7f36a2908e6c75229ce308d34806289c912a1a5f39a5aa71f9f",
"232fc124803668a9f23b1c3bcb1134274303f5c0e1b0e27c9b6c7db59f0e2a4d",
"27a0797cc5b042ba4c11e72a9555d13a67f00161550b32ede0511718b22dbc2c",
]
print (merkle(txHashes))
Note: I am new to blockchain and to encoding and decoding in Python.
In Python 3 you can do this. str.decode('hex') is gone; bytes.fromhex converts the hex string to bytes, and bytes.hex converts back:
def hash2(a, b):
    # Reverse inputs before and after hashing
    # due to big-endian / little-endian nonsense
    a1 = bytes.fromhex(a)[::-1]
    b1 = bytes.fromhex(b)[::-1]
    concat = a1 + b1
    h = hashlib.sha256(hashlib.sha256(concat).digest()).digest()
    return h[::-1].hex()
Not sure why it applies sha256 twice.
To get the result in hex without the final byte reversal, you can instead return hashlib.sha256(hashlib.sha256(concat).digest()).hexdigest().
Again, not sure why we need to apply sha256 twice.
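As a quick sanity check of the call shape (the hashes below are placeholders, not transactions from the block above), the merkle function from the question works unchanged with this Python 3 hash2:
import hashlib

# placeholder 32-byte hashes, purely to exercise merkle()/hash2()
fake_hashes = ["00" * 32, "11" * 32, "22" * 32]
print(merkle(fake_hashes))  # prints a single 64-character hex digest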

adding MIDI chords at specific MetaMessage time

I have a MIDI file with markers as meta-messages.
from mido import MidiFile, MidiTrack, Message  # assuming the mido library
fname = "avm.mid"
mid = MidiFile(fname)  # input file of type 0
metas = [m for m in mid.tracks[0] if m.is_meta]
I have stored the meta marker times in the list "chordTimes". The first marker (chord position) does not start at 0. I make a new MIDI file:
mo = MidiFile(type=1)  # output file
track = MidiTrack()
Now I read through my list of desired chords and add them to a new track to be added to mo.
for i in range(0, len(chords)-1):
    chordInfo = chordMidiNotes(chords[i], extraBass=False)
    chordNotes = chordInfo[0]
    # append to track note on messages
    if i != 0:
        for note_value in chordNotes:  # result has chord notes
            track.append(Message('note_on', note=note_value, velocity=100, time=0))
    else:
        for j, note_value in enumerate(chordNotes):  # result has chord notes
            if j == 0:
                track.append(Message('note_on', note=note_value, velocity=100, time=chordTimes[0]))
            else:
                track.append(Message('note_on', note=note_value, velocity=100, time=0))
    # append to track note off messages
    for k, note_value in enumerate(chordNotes):  # result has chord notes
        if k == 0:
            track.append(Message('note_off', note=note_value, velocity=127, time=chordTimes[i+1]))
        else:
            track.append(Message('note_off', note=note_value, velocity=127, time=0))

# now adding track to output file
mo.tracks.append(mid.tracks[0])
mo.tracks.append(track)
mo.save("songWithChords.mid")
But when I display it, the chords seem to be incorrect, appear in the wrong position, and look much longer (about 3 times) than the original. I checked the header chunk of the output:
<meta message time_signature numerator=4 denominator=4 clocks_per_click=24 notated_32nd_notes_per_beat=8 time=0>,
Any help would be greatly appreciated.
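One thing worth checking: mido delta times are expressed in ticks, and how long a tick lasts depends on the file's ticks_per_beat, so building the output with MidiFile(type=1) and its default resolution can stretch or squeeze the written times relative to the input file. A minimal sketch of carrying the source resolution over (an assumption about the cause, not a confirmed diagnosis):
from mido import MidiFile, MidiTrack

mid = MidiFile("avm.mid")
# copy the input file's resolution so delta times keep their original meaning
mo = MidiFile(type=1, ticks_per_beat=mid.ticks_per_beat)
track = MidiTrack()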

Converting RIJNDAEL 256 function from .Net to Python

Can someone please help me with converting this code?
I tried to make a RIJNDAEL-256 function out of this code:
EncryptRJ256("lkirwf897+22#bbtrm8814z5qq=498j5", "741952hheeyy66#cs!9hjv887mxx7#8y", "A padded string to BLOCKSIZE length.")
Public Function EncryptRJ256(ByVal prm_key As String, ByVal prm_iv As String, ByVal prm_text_to_encrypt As String) As String
    Dim s As String = prm_text_to_encrypt
    Dim managed2 As New RijndaelManaged With {
        .Padding = PaddingMode.Zeros,
        .Mode = CipherMode.CBC,
        .BlockSize = 256
    }
    Dim stream As New MemoryStream
    Dim stream2 As New CryptoStream(stream, managed2.CreateEncryptor(Encoding.ASCII.GetBytes(prm_key), Encoding.ASCII.GetBytes(prm_iv)), CryptoStreamMode.Write)
    Dim bytes As Byte() = Encoding.ASCII.GetBytes(s)
    stream2.Write(bytes, 0, bytes.Length)
    stream2.FlushFinalBlock()
    Return Convert.ToBase64String(stream.ToArray)
End Function
I need the output of the encrypted string to be something like this:
Dv0Y/AFXdFMlDrcldFCu8v5o9zAlLNgyM+vO+PFeSrpO8Ve82mdUcc4rkzp9afDYc75NmkSd4mdflt38kceOdA==
A padded string to BLOCKSIZE length
I came up with this but the output is invalid. It's probably because of wrong padding but I have no idea how to fix it:
from rijndael.cipher.crypt import new
from rijndael.cipher.blockcipher import MODE_CBC
import base64
PADDING = b'.'
def r_pad(payload, block_size=32):
    return payload + (block_size - len(payload) % block_size) * PADDING
KEY = 'lkirwf897+22#bbtrm8814z5qq=498j5'
IV = '741952hheeyy66#cs!9hjv887mxx7#8y'
plain_text = "A padded string to BLOCKSIZE length."
rjn = new(KEY, MODE_CBC, IV, blocksize=32)
encd = rjn.encrypt(r_pad(plain_text))
data = base64.b64encode(encd)
print(data)
rjn = new(KEY, MODE_CBC, IV, blocksize=32)
data = base64.b64decode(data)
decd = rjn.decrypt(r_pad(data))
print (decd)
This is the output:
Dv0Y/AFXdFMlDrcldFCu8v5o9zAlLNgyM+vO+PFeSrqWdzP1S1cumviFiEjNAjz5njnMMC9lfxsBl71x5y+xCw==
A padded string to BLOCKSIZE length.............................Å¿:è°⌐┘n┤«╞Px╜:æC┬♣╬Q┤▼«U_♦â☻ìr
How silly of me! Here's what fixed my issue:
from rijndael.cipher.crypt import new
from rijndael.cipher.blockcipher import MODE_CBC
import base64
PADDING = b'\x00'
def pad(payload, block_size=32):
    return payload + (block_size - len(payload) % block_size) * PADDING

def un_pad(payload):
    return payload.replace(PADDING, '')
KEY = 'lkirwf897+22#bbtrm8814z5qq=498j5'
IV = '741952hheeyy66#cs!9hjv887mxx7#8y'
plain_text = "A padded string to BLOCKSIZE length."
rjn = new(KEY, MODE_CBC, IV, blocksize=32)
encd = rjn.encrypt(pad(plain_text))
data = base64.b64encode(encd)
print(data)
rjn = new(KEY, MODE_CBC, IV, blocksize=32)
data = base64.b64decode(data)
decd = rjn.decrypt(un_pad(data))
print (decd)
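For reference, the .NET side uses PaddingMode.Zeros, so the Python side needs matching zero-padding before encryption and zero-stripping after decryption. A minimal sketch of those helpers (an illustration, not part of the original answer; typically the padding is stripped from the decrypted plaintext rather than from the ciphertext, and genuine trailing NUL bytes in a message cannot be told apart from padding):
def zero_pad(data: bytes, block_size: int = 32) -> bytes:
    # pad with NUL bytes up to the next multiple of block_size;
    # adds nothing when the data is already block-aligned
    remainder = len(data) % block_size
    if remainder == 0:
        return data
    return data + b'\x00' * (block_size - remainder)

def zero_unpad(data: bytes) -> bytes:
    # strip the trailing NUL padding after decryption
    return data.rstrip(b'\x00')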

Read a binary file *.SRS (Solar Radio Spectrograph)

I want to read a binary file (K7120127.SRS) whose characteristics are detailed in chapter 2.2 of the Word file (Documentacion SRS DATA.doc), both included at the following link:
https://drive.google.com/folderview?id=0B_NlxFaQkpgHb00yTm5kU0MyaUU&usp=sharing
The link also includes a viewer for that data (Srsdisp.exe), but I want to process the data, not only view it, which is why I'd like to read it in Python.
I know how to plot with matplotlib, but working with binary files is new to me. I'd like to plot something like this (that plot was made using the viewer included in the link).
Try this:
from struct import unpack
# constants from the file spec
RECORD_SIZE=826
RECORD_HEADER_SIZE=24
RECORD_ARRAY_SIZE=401
# verbosity values
VERBOSITY_ALL = 2 # print warnings and errors
VERBOSITY_ERRORS = 1 # print errors
VERBOSITY_NONE = 0 # print nothing
class SRSRecord:
"""Holds one 826 byte SRS Record."""
_site_to_name = {
1: "Palehua",
2: "Holloman",
3: "Learmonth",
4: "San Vito",
# add new site names here ..
}
def __init__(self):
self.year = None
self.month = None
self.day = None
self.hour = None
self.minute = None
self.seconds = None
self.site_number = None
self.site_name = None
self.n_bands_per_record = None
self.a_start_freq = None
self.a_end_freq = None
self.a_num_bytes = None
self.a_analyser_reference_level = None
self.a_analyser_attenuation = None
self.b_start_freq = None
self.b_end_freq = None
self.b_num_bytes = None
self.b_analyser_reference_level = None
self.b_analyser_attenuation = None
# dictionary that maps frequency in mega hertz to level
self.a_values = {}
# dictionary that maps frequency in mega hertz to level
self.b_values = {}
return
def _parse_srs_file_header(self, header_bytes, verbosity = VERBOSITY_ALL):
fields = unpack(
# General header information
'>' # (data packed in big endian format)
'B' # 1 Year (last 2 digits) Byte integer (unsigned)
'B' # 2 Month number (1 to 12) "
'B' # 3 Day (1 to 31) "
'B' # 4 Hour (0 to 23 UT) "
'B' # 5 Minute (0 to 59) "
'B' # 6 Second at start of scan (0 to 59) "
'B' # 7 Site Number (0 to 255) "
'B' # 8 Number of bands in the record (2) "
# Band 1 (A-band) header information
'h' # 9,10 Start Frequency (MHz) Word integer (16 bits)
'H' # 11,12 End Frequency (MHz) "
'H' # 13,14 Number of bytes in data record (401) "
'B' # 15 Analyser reference level Byte integer
'B' # 16 Analyser attenuation (dB) "
# Band 2 (B-band) header information
# 17-24 As for band 1
'H' # 17,18 Start Frequency (MHz) Word integer (16 bits)
'H' # 19,20 End Frequency (MHz) "
'H' # 21,22 Number of bytes in data record (401) "
'B' # 23 Analyser reference level Byte integer
'B', # 24 Analyser attenuation (dB) "
header_bytes)
self.year = fields[0]
self.month = fields[1]
self.day = fields[2]
self.hour = fields[3]
self.minute = fields[4]
self.seconds = fields[5]
# read the site number and work out the site name
self.site_number = fields[6]
if self.site_number not in SRSRecord._site_to_name.keys():
# got an unknown site number.. complain a bit..
if verbosity >= VERBOSITY_ALL:
print("Unknown site number: %s" % self.site_number)
print("A list of known site numbers follows:")
for site_number, site_name in SRSRecord._site_to_name.items():
print("\t%s: %s" % (site_number, site_name))
# then set the site name to unknown.
self.site_name = "UnknownSite"
else:
# otherwise look up the site using our lookup table
self.site_name = SRSRecord._site_to_name[self.site_number]
# read the number of bands
self.n_bands_per_record = fields[7] # should be 2
if self.n_bands_per_record != 2 and verbosity >= VERBOSITY_ERRORS:
print("Warning.. record has %s bands, expecting 2!" % self.n_bands_per_record)
# read the a record meta data
self.a_start_freq = fields[8]
self.a_end_freq = fields[9]
self.a_num_bytes = fields[10]
if self.a_num_bytes != 401 and verbosity >= VERBOSITY_ERRORS:
print("Warning.. record has %s bytes in the a array, expecting 401!" %
self.a_num_bytes)
self.a_analyser_reference_level = fields[11]
self.a_analyser_attenuation = fields[12]
# read the b record meta data
self.b_start_freq = fields[13]
self.b_end_freq = fields[14]
self.b_num_bytes = fields[15]
if self.b_num_bytes != 401 and verbosity >= VERBOSITY_ERRORS:
print("Warning.. record has %s bytes in the b array, expecting 401!" %
self.b_num_bytes)
self.b_analyser_reference_level = fields[16]
self.b_analyser_attenuation = fields[17]
return
def _parse_srs_a_levels(self, a_bytes):
# unpack the frequency/levels from the first array
for i in range(401):
# freq equation from the srs file format spec
freq_a = 25 + 50 * i / 400.0
level_a = unpack('>B', a_bytes[i])[0]
self.a_values[freq_a] = level_a
return
def _parse_srs_b_levels(self, b_bytes):
for i in range(401):
# freq equation from the srs file format spec
freq_b = 75 + 105 * i / 400.0
level_b = unpack('>B', b_bytes[i])[0]
self.b_values[freq_b] = level_b
return
def __str__(self):
return ("%s/%s/%s, %s:%s:%s site: %s/%s bands: %s "
"[A %s->%s MHz ref_level: %s atten: %s dB], "
"[B %s->%s MHz ref_level: %s atten: %s dB]"
)% (
self.day, self.month, self.year,
self.hour, self.minute, self.seconds,
self.site_number, self.site_name,
self.n_bands_per_record,
self.a_start_freq, self.a_end_freq,
self.a_analyser_reference_level, self.a_analyser_attenuation,
self.b_start_freq, self.b_end_freq,
self.b_analyser_reference_level, self.b_analyser_attenuation,
)
def _dump(self, values):
freqs = values.keys()
freqs.sort()
for freq in freqs:
print "%5s %s" % (freq, values[freq])
return
def dump_a(self):
self._dump(self.a_values)
return
def dump_b(self):
self._dump(self.b_values)
return
def read_srs_file(fname):
"""Parses an srs file and returns a list of SRSRecords."""
# keep the records we read in here
srs_records = []
f = open(fname, "rb")
while True:
# read raw record data
record_data = f.read(RECORD_SIZE)
# if the length of the record data is zero we've reached the end of the data
if len(record_data) == 0:
break
# break up the record bytes into header, array a and array b bytes
header_bytes = record_data[:RECORD_HEADER_SIZE]
a_bytes = record_data[RECORD_HEADER_SIZE : RECORD_HEADER_SIZE + RECORD_ARRAY_SIZE]
b_bytes = record_data[RECORD_HEADER_SIZE + RECORD_ARRAY_SIZE :
RECORD_HEADER_SIZE + 2 * RECORD_ARRAY_SIZE]
# make a new srs record
record = SRSRecord()
record._parse_srs_file_header(header_bytes, verbosity = VERBOSITY_ERRORS)
record._parse_srs_a_levels(a_bytes)
record._parse_srs_b_levels(b_bytes)
srs_records.append(record)
return srs_records
if __name__ == "__main__":
# parse the file.. (this is where the magic happens ;)
srs_records = read_srs_file(fname = "K7120127.SRS")
# play with the data
for i in range(3):
print srs_records[i]
r0 = srs_records[0]
r0.dump_a()
r0.dump_b()
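The question also asks about plotting the data with matplotlib. A minimal sketch of turning the records returned by read_srs_file above into a dynamic-spectrum image (this assumes numpy and matplotlib are installed and that every record covers the same band A frequency bins):
import numpy as np
import matplotlib.pyplot as plt

records = read_srs_file("K7120127.SRS")
freqs = sorted(records[0].a_values)  # band A frequencies in MHz
# rows = frequency bins, columns = successive records (time)
spectrum = np.array([[rec.a_values[f] for f in freqs] for rec in records]).T
plt.imshow(spectrum, aspect='auto', origin='lower',
           extent=[0, len(records), freqs[0], freqs[-1]])
plt.xlabel('Record number (time)')
plt.ylabel('Frequency (MHz)')
plt.colorbar(label='Level')
plt.show()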

Python truncate a long string

How does one truncate a string to 75 characters in Python?
This is how it is done in JavaScript:
var data="saddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddsaddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddsadddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
var info = (data.length > 75) ? data.substring(0, 75) + '..' : data;
info = (data[:75] + '..') if len(data) > 75 else data
Even more concise:
data = data[:75]
If it is less than 75 characters there will be no change.
Even shorter:
info = data[:75] + (data[75:] and '..')
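To unpack the trick: data[75:] is an empty string, which is falsy, whenever data has 75 characters or fewer, so the and expression contributes nothing; otherwise it evaluates to '..'. A quick illustration:
data = 'x' * 80
info = data[:75] + (data[75:] and '..')
print(len(info), info.endswith('..'))  # 77 True

data = 'short'
info = data[:75] + (data[75:] and '..')
print(info)  # short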
If you are using Python 3.4+, you can use textwrap.shorten from the standard library:
Collapse and truncate the given text to fit in the given width.
First the whitespace in text is collapsed (all whitespace is replaced
by single spaces). If the result fits in the width, it is returned.
Otherwise, enough words are dropped from the end so that the remaining
words plus the placeholder fit within width:
>>> textwrap.shorten("Hello world!", width=12)
'Hello world!'
>>> textwrap.shorten("Hello world!", width=11)
'Hello [...]'
>>> textwrap.shorten("Hello world", width=10, placeholder="...")
'Hello...'
For a Django solution (which has not been mentioned in the question):
from django.utils.text import Truncator
value = Truncator(value).chars(75)
Have a look at Truncator's source code to appreciate the problem:
https://github.com/django/django/blob/master/django/utils/text.py#L66
Concerning truncation with Django:
Django HTML truncation
With regex:
re.sub(r'^(.{75}).*$', '\g<1>...', data)
Long strings are truncated:
>>> data="11111111112222222222333333333344444444445555555555666666666677777777778888888888"
>>> re.sub(r'^(.{75}).*$', '\g<1>...', data)
'111111111122222222223333333333444444444455555555556666666666777777777788888...'
Shorter strings never get truncated:
>>> data="11111111112222222222333333"
>>> re.sub(r'^(.{75}).*$', '\g<1>...', data)
'11111111112222222222333333'
This way, you can also "cut" the middle part of the string, which is nicer in some cases:
re.sub(r'^(.{5}).*(.{5})$', '\g<1>...\g<2>', data)
>>> data="11111111112222222222333333333344444444445555555555666666666677777777778888888888"
>>> re.sub(r'^(.{5}).*(.{5})$', '\g<1>...\g<2>', data)
'11111...88888'
limit = 75
info = data[:limit] + '..' * (len(data) > limit)
This method doesn't use any if:
data[:75] + bool(data[75:]) * '..'
This just in:
n = 8
s = '123'
print s[:n-3] + (s[n-3:], '...')[len(s) > n]
s = '12345678'
print s[:n-3] + (s[n-3:], '...')[len(s) > n]
s = '123456789'
print s[:n-3] + (s[n-3:], '...')[len(s) > n]
s = '123456789012345'
print s[:n-3] + (s[n-3:], '...')[len(s) > n]
123
12345678
12345...
12345...
info = data[:75] + ('..' if len(data) > 75 else '')
info = data[:min(len(data), 75)]
You can't actually "truncate" a Python string the way you can a dynamically allocated C string. Strings in Python are immutable. What you can do is slice a string, as described in other answers, yielding a new string containing only the characters defined by the slice offsets and step.
In some (non-practical) cases this can be a little annoying, such as when you choose Python as your interview language and the interviewer asks you to remove duplicate characters from a string in-place. Doh.
Yet another solution. With True and False you get a little feedback about the test at the end.
data = {True: data[:75] + '..', False: data}[len(data) > 75]
Coming very late to the party I want to add my solution to trim text at character level that also handles whitespaces properly.
def trim_string(s: str, limit: int, ellipsis='…') -> str:
    s = s.strip()
    if len(s) > limit:
        return s[:limit-1].strip() + ellipsis
    return s
Simple, but it will make sure that hello world with limit=6 will not result in an ugly hello … but in hello… instead.
It also removes leading and trailing whitespace, but not spaces inside. If you also want to remove spaces inside, check out this Stack Overflow post.
>>> info = lambda data: len(data)>10 and data[:10]+'...' or data
>>> info('sdfsdfsdfsdfsdfsdfsdfsdfsdfsdfsdf')
'sdfsdfsdfs...'
>>> info('sdfsdf')
'sdfsdf'
>>>
Simple and short helper function:
def truncate_string(value, max_length=255, suffix='...'):
    string_value = str(value)
    string_truncated = string_value[:min(len(string_value), (max_length - len(suffix)))]
    suffix = (suffix if len(string_value) > max_length else '')
    return string_truncated + suffix
Usage examples:
# Example 1 (default):
long_string = ""
for number in range(1, 1000):
    long_string += str(number) + ','
result = truncate_string(long_string)
print(result)

# Example 2 (custom length):
short_string = 'Hello world'
result = truncate_string(short_string, 8)
print(result)  # > Hello...

# Example 3 (not truncated):
short_string = 'Hello world'
result = truncate_string(short_string)
print(result)  # > Hello world
If you wish to do more sophisticated string truncation, you can adopt the scikit-learn approach implemented in
sklearn.base.BaseEstimator.__repr__
(see the original full code at: https://github.com/scikit-learn/scikit-learn/blob/f3f51f9b6/sklearn/base.py#L262)
It adds benefits such as avoiding truncating in the middle of a word.
import re

def truncate_string(data, N_CHAR_MAX=70):
    # N_CHAR_MAX is the (approximate) maximum number of non-blank
    # characters to render. We pass it as an optional parameter to ease
    # the tests.
    lim = N_CHAR_MAX // 2  # approx number of chars to keep on both ends
    regex = r"^(\s*\S){%d}" % lim
    # The regex '^(\s*\S){%d}' % n
    # matches from the start of the string until the nth non-blank
    # character:
    # - ^ matches the start of string
    # - (pattern){n} matches n repetitions of pattern
    # - \s*\S matches a non-blank char following zero or more blanks
    left_lim = re.match(regex, data).end()
    right_lim = re.match(regex, data[::-1]).end()
    if "\n" in data[left_lim:-right_lim]:
        # The left side and right side aren't on the same line.
        # To avoid weird cuts, e.g.:
        # categoric...ore',
        # we need to start the right side with an appropriate newline
        # character so that it renders properly as:
        # categoric...
        # handle_unknown='ignore',
        # so we add [^\n]*\n which matches until the next \n
        regex += r"[^\n]*\n"
        right_lim = re.match(regex, data[::-1]).end()
    ellipsis = "..."
    if left_lim + len(ellipsis) < len(data) - right_lim:
        # Only add ellipsis if it results in a shorter repr
        data = data[:left_lim] + "..." + data[-right_lim:]
    return data
There's no need for a regular expression but you do want to use string formatting rather than the string concatenation in the accepted answer.
This is probably the most canonical, Pythonic way to truncate the string data at 75 characters.
>>> data = "saddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddsaddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddsadddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
>>> info = "{}..".format(data[:75]) if len(data) > 75 else data
>>> info
'saddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd..'
Here's a function I made as part of a new String class... It allows adding a suffix (if the string is long enough after trimming and adding the suffix, although you don't have to enforce the absolute size).
I was in the process of changing a few things around, so there are some useless logic costs (the if _truncate check, for instance, is no longer necessary) and there is a return at the top...
But it is still a good function for truncating data...
##
## Truncate characters of a string after _len'nth char, if necessary... If _len is less than 0, don't truncate anything... Note: If you attach a suffix, and you enable absolute max length then the suffix length is subtracted from max length... Note: If the suffix length is longer than the output then no suffix is used...
##
## Usage: Where _text = 'Testing', _width = 4
## _data = String.Truncate( _text, _width ) == Test
## _data = String.Truncate( _text, _width, '..', True ) == Te..
##
## Equivalent Alternates: Where _text = 'Testing', _width = 4
## _data = String.SubStr( _text, 0, _width ) == Test
## _data = _text[ : _width ] == Test
## _data = ( _text )[ : _width ] == Test
##
def Truncate( _text, _max_len = -1, _suffix = False, _absolute_max_len = True ):
    ## Length of the string we are considering for truncation
    _len = len( _text )
    ## Whether or not we have to truncate
    _truncate = ( False, True )[ _len > _max_len ]
    ## Note: If we don't need to truncate, there's no point in proceeding...
    if ( not _truncate ):
        return _text
    ## The suffix in string form
    _suffix_str = ( '', str( _suffix ) )[ _truncate and _suffix != False ]
    ## The suffix length
    _len_suffix = len( _suffix_str )
    ## Whether or not we add the suffix
    _add_suffix = ( False, True )[ _truncate and _suffix != False and _max_len > _len_suffix ]
    ## Suffix Offset
    _suffix_offset = _max_len - _len_suffix
    _suffix_offset = ( _max_len, _suffix_offset )[ _add_suffix and _absolute_max_len != False and _suffix_offset > 0 ]
    ## The truncate point.... If not necessary, then length of string.. If necessary then the max length with or without subtracting the suffix length... Note: It may be easier ( less logic cost ) to simply add the suffix to the calculated point, then truncate - if point is negative then the suffix will be destroyed anyway.
    ## If we don't need to truncate, then the length is the length of the string.. If we do need to truncate, then the length depends on whether we add the suffix and offset the length of the suffix or not...
    _len_truncate = ( _len, _max_len )[ _truncate ]
    _len_truncate = ( _len_truncate, _max_len )[ _len_truncate <= _max_len ]
    ## If we add the suffix, add it... Suffix won't be added if the suffix is the same length as the text being output...
    if ( _add_suffix ):
        _text = _text[ 0 : _suffix_offset ] + _suffix_str + _text[ _suffix_offset: ]
    ## Return the text after truncating...
    return _text[ : _len_truncate ]
Suppose that stryng is a string which we wish to truncate and that nchars is the number of characters desired in the output string.
stryng = "sadddddddddddddddddddddddddddddddddddddddddddddddddd"
nchars = 10
We can truncate the string as follows:
def truncate(stryng: str, nchars: int):
    return (stryng[:nchars - 6] + " [...]")[:min(len(stryng), nchars)]
The results for certain test cases are shown below:
s = "sadddddddddddddddddddddddddddddd!"
s = "sa" + 30*"d" + "!"
truncate(s, 2) == sa
truncate(s, 4) == sadd
truncate(s, 10) == sadd [...]
truncate(s, len(s)//2) == sadddddddd [...]
My solution produces reasonable results for the test cases above.
However, some pathological cases are shown below:
Some Pathological Cases!
truncate(s, len(s) - 3)() == sadddddddddddddddddddddd [...]
truncate(s, len(s) - 2)() == saddddddddddddddddddddddd [...]
truncate(s, len(s) - 1)() == sadddddddddddddddddddddddd [...]
truncate(s, len(s) + 0)() == saddddddddddddddddddddddddd [...]
truncate(s, len(s) + 1)() == sadddddddddddddddddddddddddd [...
truncate(s, len(s) + 2)() == saddddddddddddddddddddddddddd [..
truncate(s, len(s) + 3)() == sadddddddddddddddddddddddddddd [.
truncate(s, len(s) + 4)() == saddddddddddddddddddddddddddddd [
truncate(s, len(s) + 5)() == sadddddddddddddddddddddddddddddd
truncate(s, len(s) + 6)() == sadddddddddddddddddddddddddddddd!
truncate(s, len(s) + 7)() == sadddddddddddddddddddddddddddddd!
truncate(s, 9999)() == sadddddddddddddddddddddddddddddd!
Notably,
When the string contains new-line characters (\n) there could be an issue.
When nchars > len(s) we should print string s without trying to print the "[...]"
Below is some more code:
import io

class truncate:
    """
    Example of Code Which Uses truncate:
    ```
    s = "\r<class\n 'builtin_function_or_method'>"
    s = truncate(s, 10)()
    print(s)
    ```
    Examples of Inputs and Outputs:
    truncate(s, 2)() == \r
    truncate(s, 4)() == \r<c
    truncate(s, 10)() == \r<c [...]
    truncate(s, 20)() == \r<class\n 'bu [...]
    truncate(s, 999)() == \r<class\n 'builtin_function_or_method'>
    ```
    Other Notes:
    Returns a modified copy of string input
    Does not modify the original string
    """
    def __init__(self, x_stryng: str, x_nchars: int) -> str:
        """
        This initializer mostly exists to sanitize function inputs
        """
        try:
            stryng = repr("".join(str(ch) for ch in x_stryng))[1:-1]
            nchars = int(str(x_nchars))
        except BaseException as exc:
            invalid_stryng = str(x_stryng)
            invalid_stryng_truncated = repr(type(self)(invalid_stryng, 20)())
            invalid_x_nchars = str(x_nchars)
            invalid_x_nchars_truncated = repr(type(self)(invalid_x_nchars, 20)())
            strm = io.StringIO()
            print("Invalid Function Inputs", file=strm)
            print(type(self).__name__, "(",
                  invalid_stryng_truncated,
                  ", ",
                  invalid_x_nchars_truncated, ")", sep="", file=strm)
            msg = strm.getvalue()
            raise ValueError(msg) from None
        self._stryng = stryng
        self._nchars = nchars

    def __call__(self) -> str:
        stryng = self._stryng
        nchars = self._nchars
        return (stryng[:nchars - 6] + " [...]")[:min(len(stryng), nchars)]
Here's a simple function that will truncate a given string from either side:
def truncate(string, length=75, beginning=True, insert='..'):
    '''Shorten the given string to the given length.
    An ellipsis will be added to the section trimmed.

    :Parameters:
        length (int) = The maximum allowed length before truncating.
        beginning (bool) = Trim starting chars, else; ending.
        insert (str) = Chars to add at the trimmed area. (default: ellipsis)

    :Return:
        (str)

    ex. call: truncate('12345678', 4)
        returns: '..5678'
    '''
    if len(string) > length:
        if beginning:  # trim starting chars.
            string = insert + string[-length:]
        else:  # trim ending chars.
            string = string[:length] + insert
    return string
Here I use textwrap.shorten and handle more edge cases. It also includes part of the last word in case that word is more than 50% of the max width.
import textwrap

def shorten(text: str, width=30, placeholder="..."):
    """Collapse and truncate the given text to fit in the given width.

    The text first has its whitespace collapsed. If it then fits in the *width*, it is returned as is.
    Otherwise, as many words as possible are joined and then the placeholder is appended.
    """
    if not text or not isinstance(text, str):
        return str(text)
    t = text.strip()
    if len(t) <= width:
        return t
    # textwrap.shorten also throws ValueError if placeholder too large for max width
    shorten_words = textwrap.shorten(t, width=width, placeholder=placeholder)
    # textwrap.shorten doesn't split words, so if the text contains a long word without
    # spaces, the result may be too short without this word.
    # Here we use a different way to include the start of this word in case
    # shorten_words is less than 50% of `width`
    if len(shorten_words) - len(placeholder) < (width - len(placeholder)) * 0.5:
        return t[:width - len(placeholder)].strip() + placeholder
    return shorten_words
Tests:
>>> shorten("123 456", width=7, placeholder="...")
'123 456'
>>> shorten("1 23 45 678 9", width=12, placeholder="...")
'1 23 45...'
>>> shorten("1 23 45 678 9", width=10, placeholder="...")
'1 23 45...'
>>> shorten("01 23456789", width=10, placeholder="...")
'01 2345...'
>>> shorten("012 3 45678901234567", width=17, placeholder="...")
'012 3 45678901...'
>>> shorten("1 23 45 678 9", width=9, placeholder="...")
'1 23...'
>>> shorten("1 23456", width=5, placeholder="...")
'1...'
>>> shorten("123 456", width=5, placeholder="...")
'12...'
>>> shorten("123 456", width=6, placeholder="...")
'123...'
>>> shorten("12 3456789", width=9, placeholder="...")
'12 345...'
>>> shorten(" 12 3456789 ", width=9, placeholder="...")
'12 345...'
>>> shorten('123 45', width=4, placeholder="...")
'1...'
>>> shorten('123 45', width=3, placeholder="...")
'...'
>>> shorten("123456", width=3, placeholder="...")
'...'
>>> shorten([1], width=9, placeholder="...")
'[1]'
>>> shorten(None, width=5, placeholder="...")
'None'
>>> shorten("", width=9, placeholder="...")
''
