Convert string to list of bits and viceversa - python

I need to convert an ASCII string into a list of bits and vice versa:
str = "Hi" -> [0,1,0,0,1,0,0,0,0,1,1,0,1,0,0,1]
[0,1,0,0,1,0,0,0,0,1,1,0,1,0,0,1] -> "Hi"

There are many ways to do this with library functions. But I am partial to the third-party bitarray module.
>>> import bitarray
>>> ba = bitarray.bitarray()
Conversion from strings requires a bit of ceremony. Once upon a time, you could just use fromstring, but that method is now deprecated, since it has to implicitly encode the string into bytes. To avoid the inevitable encoding errors, it's better to pass a bytes object to frombytes. When starting from a string, that means you have to specify an encoding explicitly -- which is good practice anyway.
>>> ba.frombytes('Hi'.encode('utf-8'))
>>> ba
bitarray('0100100001101001')
Conversion to a list is easy. (Also, bitarray objects have a lot of list-like functions already.)
>>> l = ba.tolist()
>>> l
[False, True, False, False, True, False, False, False,
False, True, True, False, True, False, False, True]
bitarrays can be created from any iterable:
>>> bitarray.bitarray(l)
bitarray('0100100001101001')
Conversion back to bytes or strings is relatively easy too:
>>> bitarray.bitarray(l).tobytes().decode('utf-8')
'Hi'
And for the sake of sheer entertainment:
>>> def s_to_bitlist(s):
... ords = (ord(c) for c in s)
... shifts = (7, 6, 5, 4, 3, 2, 1, 0)
... return [(o >> shift) & 1 for o in ords for shift in shifts]
...
>>> def bitlist_to_chars(bl):
... bi = iter(bl)
... bytes = zip(*(bi,) * 8)
... shifts = (7, 6, 5, 4, 3, 2, 1, 0)
... for byte in bytes:
... yield chr(sum(bit << s for bit, s in zip(byte, shifts)))
...
>>> def bitlist_to_s(bl):
... return ''.join(bitlist_to_chars(bl))
...
>>> s_to_bitlist('Hi')
[0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1]
>>> bitlist_to_s(s_to_bitlist('Hi'))
'Hi'

There are probably faster ways to do this, but using no extra modules:
def tobits(s):
    """Return the characters of *s* as one flat list of 0/1 ints, MSB first.

    Every character contributes its code point in binary, left-padded with
    zeros to at least eight digits, so ``tobits("Hi")`` has 16 entries.
    An empty string gives an empty list.
    """
    result = []
    for c in s:
        # format(..., '08b') pads to eight binary digits in a single step.
        result.extend(int(b) for b in format(ord(c), '08b'))
    return result
def frombits(bits):
    """Rebuild a string from a flat sequence of 0/1 values, eight per character.

    Bits are read MSB first. Trailing bits that do not fill a whole group
    of eight are ignored.
    """
    chars = []
    # Floor division: range() rejects the float that `/` yields on Python 3.
    for b in range(len(bits) // 8):
        byte = bits[b * 8:(b + 1) * 8]
        chars.append(chr(int(''.join(str(bit) for bit in byte), 2)))
    return ''.join(chars)

not sure why, but here are two ugly oneliners using only builtins:
s = "Hi"
# Materialise the bits as a list: on Python 3 map() is lazy and cannot be sliced below.
l = [int(b) for i in s for b in bin(ord(i)).lstrip('0b').rjust(8, '0')]
# Re-assemble the string by parsing each group of eight bits as a base-2 number.
s = "".join(chr(int("".join(map(str, l[i:i+8])), 2)) for i in range(0, len(l), 8))
yields:
>>> l
[0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1]
>>> s
'Hi'
In real world code, use the struct or the bitarray module.

You could use the built-in bytearray:
>>> for i in bytearray('Hi', 'ascii'):
... print(i)
...
72
105
>>> bytearray([72, 105]).decode('ascii')
'Hi'
And bin() to convert to binary.

def text_to_bits(text):
    """Return the encoded bytes of *text* as a flat list of bits, MSB first.

    The text is encoded with ``str.encode()`` (UTF-8 by default) and every
    byte contributes exactly eight bits, so leading NUL bytes are kept and
    an empty string gives an empty list.

    >>> text_to_bits("Hi")
    [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1]
    """
    data = text.encode()
    if not data:
        return []
    bits = bin(int.from_bytes(data, 'big'))[2:]
    # Pad to the real byte count; padding from len(bits) drops leading zero bytes.
    return list(map(int, bits.zfill(8 * len(data))))
def text_from_bits(bits):
    """Decode an iterable of 0/1 values (MSB first) back into text.

    The byte count is taken from the number of bits supplied (rounded up),
    so leading zero bytes survive the round trip. No bits gives ``''``.
    The resulting bytes are decoded with ``bytes.decode()`` (UTF-8).

    >>> text_from_bits([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1])
    'Hi'
    """
    digits = ''.join(map(str, bits))
    if not digits:
        return ''
    n = int(digits, 2)
    # Size from the input length; n.bit_length() would drop leading zero bytes.
    return n.to_bytes((len(digits) + 7) // 8, 'big').decode()
See also, Convert Binary to ASCII and vice versa (Python).

def to_bin(string):
    """Return *string* as a str of '0'/'1' digits, eight (or more) per character.

    Each character's code point is written in binary and zero-padded on the
    left to at least eight digits.
    """
    # A single join avoids the quadratic `res += tmp` concatenation.
    return ''.join(format(ord(char), '08b') for char in string)
def to_str(string):
    """Decode a str of '0'/'1' digits back to text, eight digits per character.

    Trailing digits that do not fill a whole group of eight are ignored.
    """
    # Floor division: `/` yields a float on Python 3, which range() rejects.
    return ''.join(
        chr(int(string[idx * 8:(idx + 1) * 8], 2))
        for idx in range(len(string) // 8)
    )
These functions are really simple.
They don't use any third-party modules.

A few speed comparisons. Each of these were run using
python -m timeit "code"
or
cat <<-EOF | python -m timeit
code
EOF
if multiline.
Bits to Byte
A: 100000000 loops, best of 3: 0.00838 usec per loop
res = 0
# Fold MSB-first so the result (41) agrees with variant B and with the
# "Byte to Bits" examples, which both treat the first list item as the high bit.
for x in [0,0,1,0,1,0,0,1]:
    res = (res << 1) | x
B: 100000000 loops, best of 3: 0.00838 usec per loop
int(''.join(map(str, [0,0,1,0,1,0,0,1])), 2)
Byte to Bits
A: 100000000 loops, best of 3: 0.00836 usec per loop
[(41 >> x) & 1 for x in range(7, -1, -1)]
B: 100000 loops, best of 3: 2.07 usec per loop
map(int, bin(41)[2:])

import math
class BitList:
    """A sequence of bits stored as a single non-negative integer.

    The constructor accepts an int (used as-is), a str (its UTF-8 bytes are
    read as one big-endian number) or a sequence of 0/1 / bool values with
    the most significant bit first. The length is the bit length of the
    stored integer, so leading zero bits are not retained.
    """

    def __init__(self, value):
        if isinstance(value, str):
            value = int.from_bytes(value.encode("utf-8"), "big")
        try:
            # Sequence input: the last item is bit 0, the one before bit 1, ...
            self.value = sum(bit << i for i, bit in enumerate(reversed(value)))
        except TypeError:
            # Not a reversible sequence of shiftable items (e.g. a plain int).
            self.value = value

    def __getitem__(self, index):
        """Return one bit as a bool, or a slice as a BitList (a list if stepped)."""
        length = len(self)
        if isinstance(index, slice):
            if index.step is not None and index.step != 1:
                return list(self)[index]
            # slice.indices() resolves None / negative bounds and clamps them.
            start, stop, _ = index.indices(length)
            stop = max(stop, start)
            kept = self.value % (1 << (length - start))  # drop bits before start
            return BitList(kept >> (length - stop))      # drop bits from stop on
        if index < 0:
            index += length
        if not 0 <= index < length:
            raise IndexError("BitList index out of range")
        return bool((self.value >> (length - 1 - index)) & 1)

    def __len__(self):
        # int.bit_length() is exact; ceil(log2(value + 1)) goes through floats
        # and under-counts for large values such as 2 ** 70.
        return self.value.bit_length()

    def __str__(self):
        # __str__ must return a str; returning the int makes str(obj) raise.
        return str(self.value)

    def __repr__(self):
        return "BitList(" + str(self.value) + ")"

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]
Then you can initialize BitList with a number or a list (of numbers or booleans), then you can get its value, get positional items, get slices, and convert it to a list. Note: Cannot currently set items, but when I add that I will edit this post.
I made this myself, then went looking for how to convert a string (or a file) into a list of bits, and figured that part out from another answer.

This might work, but it does not work if you ask PEP 8 (long line, complex)
# Text -> string of '0'/'1' digits, each character zero-padded to at least 8 digits.
tobits = lambda x: "".join(format(ord(ch), "08b") for ch in x)
# String of '0'/'1' digits -> text, reading the digits in groups of eight.
frombits = lambda x: ''.join(chr(int(str(x[start:start + 8]), 2)) for start in range(0, len(x), 8))
These are used like normal functions.

Because I like generators, I'll post my version here:
def bits(s):
    """Yield the bits of every character of *s* as ints, MSB first.

    Each character's code point is written in binary and zero-filled to at
    least eight digits.
    """
    for c in s:
        yield from (int(bit) for bit in bin(ord(c))[2:].zfill(8))
def from_bits(b):
    """Yield one character per group of eight bits in the sequence *b*.

    *b* must support ``len()`` and slicing. A short final group is parsed
    as-is, as a smaller binary number.
    """
    for i in range(0, len(b), 8):
        yield chr(int(''.join(str(bit) for bit in b[i:i + 8]), 2))
print(list(bits('Hi')))
[0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1]
print(''.join(from_bits([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1])))
Hi

If you have bits in a list, you can join them into a string of binary digits and parse that string as a base-2 number. The number then behaves like a bit string, and bitwise operations can be applied to it.
For example :
int(''.join(map(str, [1,0,0,1])), 2) | int(''.join(map(str, [1,0,1,1])), 2)  # 0b1001 | 0b1011 == 0b1011 == 11

Related

finding possible entries in Sudoku empty cells

I have been packaging a script to calculate all possible entries in the empty cells of a sudoku game. While the algorithm to screen the vertical column and the horizontal row works, it seems that my script is not able to screen the relevant box where the empty cell is located.
The code that I am using is the following:
def possible(y,x,n):
    """Return True when *n* is absent from row *y*, column *x* and their 3x3 box.

    Reads the module-level ``grid`` (indexed ``grid[row][column]``). When
    the value is allowed, ``y+1``, ``x+1`` and ``n`` are appended to the
    module-level object named ``list`` before returning True.
    """
    global grid
    # Row scan.
    for i in range(0,9):
        if grid[y][i] == n:
            return False
    # Column scan.
    for i in range(0,9):
        if grid[i][x] == n:
            return False
    # Top-left corner of the 3x3 box that contains (y, x).
    x0 = (x//3)*3
    y0 = (y//3)*3
    for i in range(0,3):
        for j in range(0,3):
            if (grid[y0+i][x0+j] == n):
                #print((x0+j),end=' ')
                #print((y0+i),end=' ')
                return False
    # NOTE(review): `list` must be rebound to a real list at module level for
    # these calls to work; with the builtin type they raise TypeError. Prefer
    # collecting results in the caller under a non-builtin name.
    list.append(y+1)
    list.append(x+1)
    list.append(n)
    return True
It seems that there is some problem with the append procedure.....
Any assistance is welcome
My general comments:
Seems like you are trying to append to a list which might or might
not be defined outside of the possible() function (it's not in the supplied code).
However, as it is
not defined within the scope of that function, you generally can't
access it from the inside. Related read.)
Also you should change a variable name as
list is a built-in type of Python and it is not
recommended to use builtin types as variable
names unless you absolutely need to do so for some reason.
Generally it is not a best practice to use global variables.
My suggestion would be to move the gathering of possible
numbers outside of this function. Example:
def possible(grid: list[list[int]], num: int, pos: tuple[int, int]) -> bool:
    """Return True if *num* does not already occur in the row, column or box of *pos*.

    *pos* is ``(row, column)`` into the 9x9 *grid*. The function only reads
    the grid; collecting the allowed values is left to the caller.
    """
    row, col = pos
    # Check row
    if num in grid[row]:
        return False
    # Check column
    if any(line[col] == num for line in grid):
        return False
    # Check box. No extra "is it the cell itself?" test is needed: the cell's
    # own row was already rejected above if it contains num.
    top = (row // 3) * 3
    left = (col // 3) * 3
    for r in range(top, top + 3):
        if num in grid[r][left:left + 3]:
            return False
    return True
Then run this 'cleaner' function in a for loop or list comprehension to collect possible numbers for a given position on the Sudoku grid.
For example (for cycle):
possible_values = []
for i in range(1,10):
    # pos is (row, column): the row index y comes first, then the column x.
    if possible(grid, i, (y, x)):
        possible_values.append(i)
Or this (list comprehension):
possible_values = [n for n in range(1,10) if possible(grid, n, (0, 2))]
In the Sudoku world, the "possible entries" of the empty cells are called candidates or pencil marks.
Here is how we can identify the candidates of the empty cells.
grid = [
    [0, 0, 0, 6, 0, 8, 9, 1, 0],
    [6, 0, 2, 0, 9, 0, 3, 4, 0],
    [1, 9, 8, 3, 0, 0, 0, 6, 7],
    [0, 5, 9, 0, 0, 0, 4, 2, 3],
    [4, 0, 0, 8, 0, 3, 0, 0, 1],
    [7, 1, 3, 0, 2, 0, 8, 0, 0],
    [9, 6, 0, 5, 3, 7, 2, 8, 0],
    [2, 0, 0, 4, 1, 0, 0, 3, 0],
    [3, 4, 0, 2, 8, 0, 1, 7, 9],
]
# One flag per (cell, digit): candidates[9*cell + digit - 1] is 1 when the
# digit is still possible in that cell. Cells are numbered row by row, 0..80.
candidates=bytearray(729)


def identifyCandidates():
    """Recompute every flag in ``candidates`` from the current ``grid``.

    Each flag is written on every call (1 or 0), so running this again
    after the grid has changed leaves no stale candidates behind. Filled
    cells get no candidates.
    """
    for cell in range(81):
        row, col = divmod(cell, 9)
        is_empty = grid[row][col] == 0
        for kan in range(1, 10):
            allowed = is_empty and not clueInSector(cell, kan)
            candidates[9*cell + (kan - 1)] = 1 if allowed else 0


def clueInSector(cell, clue):
    """Return True if *clue* already appears in the row, column or box of *cell*.

    The cell itself is never compared against *clue*.
    """
    cellrow, cellcol = divmod(cell, 9)
    for col in range(9):
        if col != cellcol and grid[cellrow][col] == clue:
            return True
    for row in range(9):
        if row != cellrow and grid[row][cellcol] == clue:
            return True
    rowB = 3 * (cellrow // 3)
    colB = 3 * (cellcol // 3)
    for row in range(rowB, rowB + 3):
        for col in range(colB, colB + 3):
            # Box cells sharing the row or column were covered by the scans above.
            if col != cellcol and row != cellrow and grid[row][col] == clue:
                return True
    return False
Print of the 13 first cells:
cell:0 candidates: 5.
cell:1 candidates: 3.7.
cell:2 candidates: 4.5.7.
cell:3 no candidate.
cell:4 candidates: 4.5.7.
cell:5 no candidate.
cell:6 no candidate.
cell:7 no candidate.
cell:8 candidates: 2.5.
cell:9 no candidate.
cell:10 candidates: 7.
cell:11 no candidate.
cell:12 candidates: 1.7.

Google foo.bar failing all test cases but working in python IDE

So I'm doing the foo.bar challenge, and I've got code in python that outputs the required answers. I know for a fact that for at least the first two test cases my output matches their output but it still fails all of them. I assumed it could be because its running in python 2.7.13 so I found an online sandbox that runs that version of python but my code still outputs the required output there too. I've tried using the print function to output the results, I've tried formatting the results as lists and arrays but none of this worked. The question is below:
Doomsday Fuel
Making fuel for the LAMBCHOP's reactor core is a tricky process
because of the exotic matter involved. It starts as raw ore, then
during processing, begins randomly changing between forms, eventually
reaching a stable form. There may be multiple stable forms that a
sample could ultimately reach, not all of which are useful as fuel.
Commander Lambda has tasked you to help the scientists increase fuel
creation efficiency by predicting the end state of a given ore sample.
You have carefully studied the different structures that the ore can
take and which transitions it undergoes. It appears that, while
random, the probability of each structure transforming is fixed. That
is, each time the ore is in 1 state, it has the same probabilities of
entering the next state (which might be the same state). You have
recorded the observed transitions in a matrix. The others in the lab
have hypothesized more exotic forms that the ore can become, but you
haven't seen all of them.
Write a function solution(m) that takes an array of array of
nonnegative ints representing how many times that state has gone to
the next state and return an array of ints for each terminal state
giving the exact probabilities of each terminal state, represented as
the numerator for each state, then the denominator for all of them at
the end and in simplest form. The matrix is at most 10 by 10. It is
guaranteed that no matter which state the ore is in, there is a path
from that state to a terminal state. That is, the processing will
always eventually end in a stable state. The ore starts in state 0.
The denominator will fit within a signed 32-bit integer during the
calculation, as long as the fraction is simplified regularly.
For example, consider the matrix m: [ [0,1,0,0,0,1], # s0, the
initial state, goes to s1 and s5 with equal probability
[4,0,0,3,2,0], # s1 can become s0, s3, or s4, but with different
probabilities [0,0,0,0,0,0], # s2 is terminal, and unreachable
(never observed in practice) [0,0,0,0,0,0], # s3 is terminal
[0,0,0,0,0,0], # s4 is terminal [0,0,0,0,0,0], # s5 is terminal ]
So, we can consider different paths to terminal states, such as: s0 ->
s1 -> s3 s0 -> s1 -> s0 -> s1 -> s0 -> s1 -> s4 s0 -> s1 -> s0 -> s5
Tracing the probabilities of each, we find that s2 has probability 0
s3 has probability 3/14 s4 has probability 1/7 s5 has probability 9/14
So, putting that together, and making a common denominator, gives an
answer in the form of [s2.numerator, s3.numerator, s4.numerator,
s5.numerator, denominator] which is [0, 3, 2, 9, 14].
Languages
To provide a Java solution, edit Solution.java To provide a Python
solution, edit solution.py
Test cases
========== Your code should pass the following test cases. Note that it may also be run against hidden test cases not shown here.
-- Java cases -- Input: Solution.solution({{0, 2, 1, 0, 0}, {0, 0, 0, 3, 4}, {0, 0, 0, 0, 0}, {0, 0, 0, 0,0}, {0, 0, 0, 0, 0}}) Output:
[7, 6, 8, 21]
Input: Solution.solution({{0, 1, 0, 0, 0, 1}, {4, 0, 0, 3, 2, 0}, {0,
0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0,
0, 0}}) Output:
[0, 3, 2, 9, 14]
-- Python cases -- Input: solution.solution([[0, 2, 1, 0, 0], [0, 0, 0, 3, 4], [0, 0, 0, 0, 0], [0, 0, 0, 0,0], [0, 0, 0, 0, 0]]) Output:
[7, 6, 8, 21]
Input: solution.solution([[0, 1, 0, 0, 0, 1], [4, 0, 0, 3, 2, 0], [0,
0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0,
0, 0]]) Output:
[0, 3, 2, 9, 14]
my code is below:
import numpy as np
from fractions import Fraction
from math import gcd
def _gcd(a, b):
    """Greatest common divisor by Euclid's algorithm (no math.gcd needed)."""
    while b:
        a, b = b, a % b
    return a


def solution(M):
    """Return the absorption probabilities of the chain *M*, starting in state 0.

    *M* is a square list of lists of non-negative ints; ``M[i][j]`` counts
    the observed transitions from state i to state j. A state whose row
    sums to zero is terminal. The result lists one numerator per terminal
    state (in state order) followed by their common denominator, e.g.
    ``[0, 3, 2, 9, 14]``.

    All arithmetic uses ``Fraction`` so the answer is exact, and chains of
    any depth are handled by solving the linear system instead of
    following only one- and two-step paths through state 0.

    Raises ValueError if some non-terminal state cannot reach a terminal one.
    """
    size = len(M)
    row_totals = [sum(row) for row in M]
    terminal = [i for i in range(size) if row_totals[i] == 0]
    transient = [i for i in range(size) if row_totals[i] != 0]

    # Starting in a terminal state: it is reached with probability 1.
    if row_totals[0] == 0:
        return [1] + [0] * (len(terminal) - 1) + [1]

    # Build the augmented system (I - Q)^T x = e0, where Q holds the
    # transient -> transient probabilities. x[i] is then the expected number
    # of visits to transient state i when starting from state 0.
    k = len(transient)
    aug = []
    for r in range(k):
        row = []
        for c in range(k):
            src = transient[c]
            q = Fraction(int(M[src][transient[r]]), int(row_totals[src]))
            row.append((1 if r == c else 0) - q)
        row.append(Fraction(1 if r == 0 else 0))  # state 0 is transient[0]
        aug.append(row)

    # Gauss-Jordan elimination over exact fractions.
    for col in range(k):
        pivot = next((r for r in range(col, k) if aug[r][col] != 0), None)
        if pivot is None:
            raise ValueError("a non-terminal state cannot reach a terminal state")
        aug[col], aug[pivot] = aug[pivot], aug[col]
        pivot_value = aug[col][col]
        aug[col] = [v / pivot_value for v in aug[col]]
        for r in range(k):
            if r != col and aug[r][col] != 0:
                factor = aug[r][col]
                aug[r] = [a - factor * b for a, b in zip(aug[r], aug[col])]
    visits = [aug[r][k] for r in range(k)]

    # Probability of ending in terminal state t: sum of visits * P(i -> t).
    probs = []
    for t in terminal:
        total = Fraction(0)
        for i in range(k):
            src = transient[i]
            total += visits[i] * Fraction(int(M[src][t]), int(row_totals[src]))
        probs.append(total)

    # Put every probability over the least common denominator.
    common = 1
    for p in probs:
        common = common * p.denominator // _gcd(common, p.denominator)
    return [int(p.numerator * (common // p.denominator)) for p in probs] + [int(common)]
So the code and the questions are basically irrelevant, the main point is, my output (y) is exactly the same as the output in the examples, but when it runs in foo.bar it fails. To test it I used a code that simply returned the desired output in foo.bar and it worked for the test case that had this output:
def solution(M):
y = [0, 3, 2, 9, 14]
return y
So I know that since my code gets to the exact same array and data type for y in the python IDE it should work in google foo.bar, but for some reason its not. Any help would be greatly appreciated
edit:
I found a code online that works:
import numpy as np
# Returns indexes of active & terminal states
def detect_states(matrix):
    """Split row indexes into (active, terminal) lists.

    A row whose entries sum to zero is terminal; every other row is active.
    """
    active, terminal = [], []
    for rowN, row in enumerate(matrix):
        (active if sum(row) else terminal).append(rowN)
    return (active, terminal)
# Convert elements of array in simplest form
def simplest_form(B):
    """Reduce a 1-row np.matrix of near-integer numerators to lowest terms.

    The values are rounded to ints, their sum is appended as the common
    denominator, and everything is divided by the gcd of the numerators.
    Returns a 1-D integer array.
    """
    B = B.round().astype(int).A1  # np.matrix --> np.array
    gcd = np.gcd.reduce(B)
    B = np.append(B, B.sum())  # append the common denom
    # Exact integer division: every entry is a multiple of gcd, and `//`
    # avoids the float round trip of `/` followed by astype(int).
    return (B // gcd).astype(int)
# Finds solution by calculating Absorbing probabilities
def solution(m):
active, terminal = detect_states(m)
if 0 in terminal: # special case when s0 is terminal
return [1] + [0]*len(terminal[1:]) + [1]
m = np.matrix(m, dtype=float)[active, :] # list --> np.matrix (active states only)
comm_denom = np.prod(m.sum(1)) # product of sum of all active rows (used later)
P = m / m.sum(1) # divide by sum of row to convert to probability matrix
Q, R = P[:, active], P[:, terminal] # separate Q & R
I = np.identity(len(Q))
N = (I - Q) ** (-1) # calc fundamental matrix
B = N[0] * R * comm_denom / np.linalg.det(N) # get absorbing probs & get them close to some integer
return simplest_form(B)
When I compared the final answer from this working code to mine by adding the lines:
print(simplest_form(B))
print(type(simplest_form(B))
this is what I got
[ 0 3 2 9 14]
<class 'numpy.ndarray'>
array([ 0, 3, 2, 9, 14])
When I added the lines
y = np.asarray(y)
print(y)
print(type(y))
to my code this is what I got:
[ 0 3 2 9 14]
<class 'numpy.ndarray'>
array([ 0, 3, 2, 9, 14])
when they were both running the same test input. These are the exact same but for some reason mine doesn't work on foo.bar but his does. Am I missing something?
It turns out the
math.gcd(x, y)
function does not exist in Python 2 (math.gcd was only added in Python 3.5; Python 2 offers fractions.gcd instead). I just rewrote it as this:
def grcd(x, y):
    """Return the greatest common divisor of the integers *x* and *y*.

    Uses Euclid's algorithm. Scanning down from the smaller argument for
    the first divisor of the larger one does not give the gcd (for 4 and 6
    it finds 3), and divides by zero when one argument is 0.
    ``grcd(0, n)`` is ``abs(n)``.
    """
    x, y = abs(x), abs(y)
    while y:
        x, y = y, x % y
    return x

i have a python list, using map function is omitting the first zero of the list

I have this code in python, when I print the last line, it is giving an output "11100101100". I'm expecting the output,"011100101100". Notice that the output starts with 1 and not 0. although the variable gamma_sum_list is a list containing 12 digits and its starts with 0. The function somehow deletes the first zero automatically. The following is the exact gamma_sum_list:
def convert(list):
    """Join the items of *list* into one decimal int.

    Because the joined digits are passed through ``int()``, any leading
    zeros are lost: ``[0, 1, 1]`` becomes ``11``.
    """
    # The missing colon after the signature made this a SyntaxError.
    res = int("".join(map(str,list)))
    return res
print(convert(gamma_sum_list))
Input:
[0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0]
Expected Output:
011100101100
Actual Output :
11100101100
Your issue is caused by converting the result of the join operation to an integer. Integers do not have leading zeroes. If you remove the int function you'll get a string with the leading zero you're after.
gamma_sum_list = [0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0]
def convert(my_list):
res = "".join(map(str,my_list))
return res
print(convert(gamma_sum_list))
Output:
011100101100
def convert(some_list):
res = "".join(map(str,some_list))
return res
gamma_sum_list = [0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0]
print(convert(gamma_sum_list))
or
# Concatenate the str() of every item; leading zeros survive because nothing is parsed as int.
conv = lambda x: ''.join(str(item) for item in x)
print(conv(gamma_sum_list))
Consider that:
>>> "".join(list(map(str, [0, 1])))
'01'
How would you convert '01' to an integer? Well, its just 1.
>>> int("".join(list(map(str, [0, 1]))))
1
So you probably want to not convert the string to an int, just keep it as a str.

Unicode as String without conversion Python

I'm trying to convert unicode text to string literally, but I don't seem to find a way to do this.
input= u'/123/123/123'
convert to string:
output="/123/123/123"
If I try to do str(), it will encode it and if I try to loop over the text and convert letter by letter, it will give me each one of the unicode characters.
EDIT: Take into consideration that the objective is not to convert the string but to take the letters in the unicode text and create a string. If I follow the link provided in the comment:
Convert a Unicode string to a string in Python (containing extra symbols)
import unicodedata
unicodedata.normalize('NFKD', input).encode('ascii','ignore')
output='SSS'
and as it is possible to see..it is not the expected output.
Edit: I wrote as an example the unicode u'/123' but Im trying to convert chinese characters, example:
a=u'\u6c34'
str(a)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u6c34' in position 0: ordinal not in range(128)
output_expected="\u6c34"
I've tried to convert it with str() as you mention in your question, and it does work for me. You can check the encoding with type().
>>> input= u'/123/123/123'
>>> type(input)
<type 'unicode'>
>>> output=str(input)
>>> print output
/123/123/123
>>> type(output)
<type 'str'>
How do you try to iterate among the letters? I've tried and they are still as a string. You could convert the input first and then do whatever you want once they are str:
letters = [x for x in output]
for letter in letters:
... print type(letter)
...
I hope it helps!
Here's how to do it the easy way:
>>> a=u'\x83\u6c34\U00103ABC'
>>> a.encode('unicode_escape')
'\\x83\\u6c34\\U00103abc'
>>> print a.encode('unicode_escape')
\x83\u6c34\U00103abc
Here's how to do it the hard way.
try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr already returns a text character

ascii_printable = set(_unichr(i) for i in range(0x20, 0x7f))


def convert(ch):
    """Return *ch* unchanged if it is printable ASCII, else an escape for it.

    Code points below 0x100 become ``\\xNN``, below 0x10000 ``\\uNNNN`` and
    anything larger ``\\UNNNNNNNN`` (lower-case hex digits).
    """
    if ch in ascii_printable:
        return ch
    ix = ord(ch)
    if ix < 0x100:
        return '\\x%02x' % ix
    elif ix < 0x10000:
        return '\\u%04x' % ix
    return '\\U%08x' % ix
output = ''.join(convert(ch) for ch in input)
For Python 3 use chr instead of unichr.
Somebody wrote a really complete code for doing this, so cool, sources:
import unicodedata
def fix_bad_unicode(text):
if not isinstance(text, unicode):
raise TypeError("This isn't even decoded into Unicode yet. "
"Decode it first.")
if len(text) == 0:
return text
maxord = max(ord(char) for char in text)
tried_fixing = []
if maxord < 128:
# Hooray! It's ASCII!
return text
else:
attempts = [(text, text_badness(text) + len(text))]
if maxord < 256:
tried_fixing = reinterpret_latin1_as_utf8(text)
tried_fixing2 = reinterpret_latin1_as_windows1252(text)
attempts.append((tried_fixing, text_cost(tried_fixing)))
attempts.append((tried_fixing2, text_cost(tried_fixing2)))
elif all(ord(char) in WINDOWS_1252_CODEPOINTS for char in text):
tried_fixing = reinterpret_windows1252_as_utf8(text)
attempts.append((tried_fixing, text_cost(tried_fixing)))
else:
# We can't imagine how this would be anything but valid text.
return text
# Sort the results by badness
attempts.sort(key=lambda x: x[1])
#print attempts
goodtext = attempts[0][0]
if goodtext == text:
return goodtext
else:
return fix_bad_unicode(goodtext)
def reinterpret_latin1_as_utf8(wrongtext):
    """Re-read text that was decoded as Latin-1 as if its bytes were UTF-8.

    Characters that cannot be encoded as Latin-1, and byte sequences that
    are not valid UTF-8, are replaced rather than raising.
    """
    newbytes = wrongtext.encode('latin-1', 'replace')
    return newbytes.decode('utf-8', 'replace')
def reinterpret_windows1252_as_utf8(wrongtext):
    """Re-read text that was decoded as Windows-1252 as if its bytes were UTF-8.

    Characters listed in WINDOWS_1252_GREMLINS are encoded with the
    Windows-1252 codec, all others with Latin-1 (unencodable ones are
    replaced). The resulting bytes are decoded as UTF-8 with replacement.
    """
    altered_bytes = []
    for char in wrongtext:
        if ord(char) in WINDOWS_1252_GREMLINS:
            altered_bytes.append(char.encode('WINDOWS_1252'))
        else:
            altered_bytes.append(char.encode('latin-1', 'replace'))
    # b''.join: the pieces are bytes, which ''.join rejects on Python 3.
    return b''.join(altered_bytes).decode('utf-8', 'replace')
def reinterpret_latin1_as_windows1252(wrongtext):
    """Re-read text that was decoded as Latin-1 as if it were Windows-1252.

    Raises UnicodeEncodeError if *wrongtext* contains a character outside
    Latin-1. Bytes that Windows-1252 leaves undefined are replaced.
    """
    return wrongtext.encode('latin-1').decode('WINDOWS_1252', 'replace')
def text_badness(text):
assert isinstance(text, unicode)
errors = 0
very_weird_things = 0
weird_things = 0
prev_letter_script = None
for pos in xrange(len(text)):
char = text[pos]
index = ord(char)
if index < 256:
weird_things += SINGLE_BYTE_WEIRDNESS[index]
if SINGLE_BYTE_LETTERS[index]:
prev_letter_script = 'latin'
else:
prev_letter_script = None
else:
category = unicodedata.category(char)
if category == 'Co':
# Unassigned or private use
errors += 1
elif index == 0xfffd:
# Replacement character
errors += 1
elif index in WINDOWS_1252_GREMLINS:
lowchar = char.encode('WINDOWS_1252').decode('latin-1')
weird_things += SINGLE_BYTE_WEIRDNESS[ord(lowchar)] - 0.5
if category.startswith('L'):
name = unicodedata.name(char)
scriptname = name.split()[0]
freq, script = SCRIPT_TABLE.get(scriptname, (0, 'other'))
if prev_letter_script:
if script != prev_letter_script:
very_weird_things += 1
if freq == 1:
weird_things += 2
elif freq == 0:
very_weird_things += 1
prev_letter_script = script
else:
prev_letter_script = None
return 100 * errors + 10 * very_weird_things + weird_things
def text_cost(text):
"""
Assign a cost function to the length plus weirdness of a text string.
"""
return text_badness(text) + len(text)
WINDOWS_1252_GREMLINS = [
# adapted from http://effbot.org/zone/unicode-gremlins.htm
0x0152, # LATIN CAPITAL LIGATURE OE
0x0153, # LATIN SMALL LIGATURE OE
0x0160, # LATIN CAPITAL LETTER S WITH CARON
0x0161, # LATIN SMALL LETTER S WITH CARON
0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
0x017E, # LATIN SMALL LETTER Z WITH CARON
0x017D, # LATIN CAPITAL LETTER Z WITH CARON
0x0192, # LATIN SMALL LETTER F WITH HOOK
0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
0x02DC, # SMALL TILDE
0x2013, # EN DASH
0x2014, # EM DASH
0x201A, # SINGLE LOW-9 QUOTATION MARK
0x201C, # LEFT DOUBLE QUOTATION MARK
0x201D, # RIGHT DOUBLE QUOTATION MARK
0x201E, # DOUBLE LOW-9 QUOTATION MARK
0x2018, # LEFT SINGLE QUOTATION MARK
0x2019, # RIGHT SINGLE QUOTATION MARK
0x2020, # DAGGER
0x2021, # DOUBLE DAGGER
0x2022, # BULLET
0x2026, # HORIZONTAL ELLIPSIS
0x2030, # PER MILLE SIGN
0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0x20AC, # EURO SIGN
0x2122, # TRADE MARK SIGN
]
# a list of Unicode characters that might appear in Windows-1252 text
# list(...) is required on Python 3, where range() is not a list and cannot be concatenated.
WINDOWS_1252_CODEPOINTS = list(range(256)) + WINDOWS_1252_GREMLINS
# Rank the characters typically represented by a single byte -- that is, in
# Latin-1 or Windows-1252 -- by how weird it would be to see them in running
# text.
#
# 0 = not weird at all
# 1 = rare punctuation or rare letter that someone could certainly
# have a good reason to use. All Windows-1252 gremlins are at least
# weirdness 1.
# 2 = things that probably don't appear next to letters or other
# symbols, such as math or currency symbols
# 3 = obscure symbols that nobody would go out of their way to use
# (includes symbols that were replaced in ISO-8859-15)
# 4 = why would you use this?
# 5 = unprintable control character
#
# The Portuguese letter à (0xc3) is marked as weird because it would usually
# appear in the middle of a word in actual Portuguese, and meanwhile it
# appears in the mis-encodings of many common characters.
SINGLE_BYTE_WEIRDNESS = (
# 0 1 2 3 4 5 6 7 8 9 a b c d e f
5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 5, 5, 5, # 0x00
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, # 0x10
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x20
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x30
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x40
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x50
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x60
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, # 0x70
2, 5, 1, 4, 1, 1, 3, 3, 4, 3, 1, 1, 1, 5, 1, 5, # 0x80
5, 1, 1, 1, 1, 3, 1, 1, 4, 1, 1, 1, 1, 5, 1, 1, # 0x90
1, 0, 2, 2, 3, 2, 4, 2, 4, 2, 2, 0, 3, 1, 1, 4, # 0xa0
2, 2, 3, 3, 4, 3, 3, 2, 4, 4, 4, 0, 3, 3, 3, 0, # 0xb0
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xc0
1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, # 0xd0
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xe0
1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, # 0xf0
)
# Pre-cache the Unicode data saying which of these first 256 characters are
# letters. We'll need it often.
SINGLE_BYTE_LETTERS = [
unicodedata.category(unichr(i)).startswith('L')
for i in xrange(256)
]
# A table telling us how to interpret the first word of a letter's Unicode
# name. The number indicates how frequently we expect this script to be used
# on computers. Many scripts not included here are assumed to have a frequency
# of "0" -- if you're going to write in Linear B using Unicode, you're
# probably aware enough of encoding issues to get it right.
#
# The lowercase name is a general category -- for example, Han characters and
# Hiragana characters are very frequently adjacent in Japanese, so they all go
# into category 'cjk'. Letters of different categories are assumed not to
# appear next to each other often.
SCRIPT_TABLE = {
'LATIN': (3, 'latin'),
'CJK': (2, 'cjk'),
'ARABIC': (2, 'arabic'),
'CYRILLIC': (2, 'cyrillic'),
'GREEK': (2, 'greek'),
'HEBREW': (2, 'hebrew'),
'KATAKANA': (2, 'cjk'),
'HIRAGANA': (2, 'cjk'),
'HIRAGANA-KATAKANA': (2, 'cjk'),
'HANGUL': (2, 'cjk'),
'DEVANAGARI': (2, 'devanagari'),
'THAI': (2, 'thai'),
'FULLWIDTH': (2, 'cjk'),
'MODIFIER': (2, None),
'HALFWIDTH': (1, 'cjk'),
'BENGALI': (1, 'bengali'),
'LAO': (1, 'lao'),
'KHMER': (1, 'khmer'),
'TELUGU': (1, 'telugu'),
'MALAYALAM': (1, 'malayalam'),
'SINHALA': (1, 'sinhala'),
'TAMIL': (1, 'tamil'),
'GEORGIAN': (1, 'georgian'),
'ARMENIAN': (1, 'armenian'),
'KANNADA': (1, 'kannada'), # mostly used for looks of disapproval
'MASCULINE': (1, 'latin'),
'FEMININE': (1, 'latin')
}
Then you just call the method:
fix_bad_unicode(u'aあä')
>> u'a\u3042\xe4'

counting up and then down a range in python

I am trying to program a standard snake draft, where team A picks, then team B, team C, team C, team B, team A, ad nauseam.
If pick number 13 (or pick number x) just happened how can I figure which team picks next for n number of teams.
I have something like:
def slot(n,x):
    """Return the 1-based team that makes pick index *x* in a snake draft of *n* teams.

    Picks are numbered from 0. Even-numbered rounds count up 1..n and
    odd-numbered rounds count back down n..1.
    """
    spot = (x % n) + 1
    # Floor division keeps the round number exact; int(x/n) goes through a
    # float and gives the wrong parity for very large x.
    counting_down = (x // n) & 1
    return (n + 1) - spot if counting_down else spot
I have a feeling there is a simpler, more pythonic way than this solution. Anyone care to take a hack at it?
So I played around a little more. I am looking for the return of a single value, rather than the best way to count over a looped list. The most literal answer might be:
def slot(n, x): # 0.15757 sec for 100,000x
    """Return the 1-based team for pick index *x* by indexing a full up/down list."""
    # list(...) is needed on Python 3, where two range objects cannot be added.
    number_range = list(range(1, n+1)) + list(range(n,0, -1))
    index = x % (n*2)
    return number_range[index]
This creates a list [1,2,3,4,4,3,2,1], figures out the index (e.g. 13 % (4*2) = 5), and then returns the value at that index in the list (e.g. 3). The longer the list, the slower the function.
We can use some logic to cut the list making in half. If we are counting up (i.e. (int(x/n) & 1) returns False), we get the obvious index value (x % n), else we subtract that value from n+1:
def slot(n, x): # 0.11982 sec for 100,000x
    """Return the 1-based team for pick index *x* using only the ascending list."""
    # Only the ascending half is ever indexed (index is always < n), so the
    # descending half does not need to be built.
    number_range = list(range(1, n+1))
    # Odd rounds run backwards: mirror the position within the round.
    index = ((n-1) - (x % n)) if ((x // n) & 1) else (x % n)
    return number_range[index]
Still avoiding a list altogether is fastest:
def slot(n, x): # 0.07275 sec for 100,000x
    """Return the 1-based team for pick index *x* without building any list."""
    spot = (x % n) + 1
    # x // n is the round number; odd rounds count down. Floor division stays
    # exact for large x, unlike int(x/n).
    if (x // n) & 1:
        return (n + 1) - spot
    return spot
And if I hold the list as variable rather than spawning one:
number_list = [1,2,3,4,5,6,7,8,9,10,11,12,12,11,10,9,8,7,6,5,4,3,2,1]
def slot(n, x): # 0.03638 sec for 100,000x
return number_list[x % (n*2)]
Why not use itertools cycle function:
from itertools import cycle
li = range(1, n+1) + range(n, 0, -1) # e.g. [1, 2, 3, 4, 4, 3, 2, 1]
it = cycle(li)
[next(it) for _ in xrange(10)] # [1, 2, 3, 4, 4, 3, 2, 1, 1, 2]
Note: previously I had answered how to run up and down, as follows:
it = cycle(range(1, n+1) + range(n, 0, -1)) #e.g. [1, 2, 3, 4, 3, 2, 1, 2, 3, ...]
Here's a generator that will fulfill what you want.
def draft(n):
    """Yield snake-draft slots forever: 1..n, then n..1, then 1..n again, ...

    Uses range(), which iterates lazily on Python 3 and also works on
    Python 2, where xrange was the lazy form.
    """
    while True:
        for i in range(1, n+1):
            yield i
        for i in range(n, 0, -1):
            yield i
>>> d = draft(3)
>>> [d.next() for _ in xrange(12)]
[1, 2, 3, 3, 2, 1, 1, 2, 3, 3, 2, 1]
from itertools import chain, cycle
def cycle_up_and_down(first, last):
    """Return an endless iterator counting first..last, then last..first, repeated."""
    # range() replaces the Python-2-only xrange.
    up = range(first, last+1, 1)
    down = range(last, first-1, -1)
    return cycle(chain(up, down))
turns = cycle_up_and_down(1, 4)
print [next(turns) for n in xrange(10)] # [1, 2, 3, 4, 4, 3, 2, 1, 1, 2]
Here is a list of numbers that counts up, then down:
>>> [ -abs(5-i)+5 for i in range(0,10) ]
[0, 1, 2, 3, 4, 5, 4, 3, 2, 1]
Written out:
count_up_to = 5
for i in range( 0, count_up_to*2 ):
the_number_you_care_about = -abs(count_up_to-i) + count_up_to
# do stuff with the_number_you_care_about
Easier to read:
>>> list( range(0,5) ) + list( range( 5, 0, -1 ) )
[0, 1, 2, 3, 4, 5, 4, 3, 2, 1]
Written out:
count_up_to = 5
for i in list( range(0,5) ) + list( range(5, 0, -1) ):
# i is the number you care about
Another way:
from itertools import chain
for i in chain( range(0,5), range(5,0,-1) ):
# i is the number you care about

Categories