How to ensure all string literals are unicode in Python

I have a fairly large Python code base to go through. It has an issue where some string literals are byte strings and others are unicode, and this causes bugs. I am trying to convert everything to unicode. I was wondering if there is a tool that can convert all literals to unicode, i.e. if it found something like this:
print "result code %d" % result['code']
to:
print u"result code %d" % result[u'code']
If it helps, I use PyCharm (in case there is an extension that does this); however, I would be happy to use a command line tool as well. Hopefully such a tool exists.

You can use tokenize.generate_tokens to break the string representation of Python code into tokens. tokenize also classifies the tokens for you, so you can identify string literals in Python code.
It is then not hard to manipulate the tokens, adding 'u' where desired:
import tokenize
import token
import io
import collections

class Token(collections.namedtuple('Token', 'num val start end line')):
    @property
    def name(self):
        return token.tok_name[self.num]

def change_str_to_unicode(text):
    result = text.splitlines()
    # Insert a dummy line into result so indexing result
    # matches tokenize's 1-based indexing
    result.insert(0, '')
    changes = []
    for tok in tokenize.generate_tokens(io.BytesIO(text).readline):
        tok = Token(*tok)
        if tok.name == 'STRING' and not tok.val.startswith('u'):
            changes.append(tok.start)
    # apply changes in reverse so earlier insertions don't shift later positions
    for linenum, s in reversed(changes):
        line = result[linenum]
        result[linenum] = line[:s] + 'u' + line[s:]
    return '\n'.join(result[1:])
text = '''print "result code %d" % result['code']
# doesn't touch 'strings' in comments
'handles multilines' + \
'okay'
u'Unicode is not touched'
'''
print(change_str_to_unicode(text))
yields
print u"result code %d" % result[u'code']
# doesn't touch 'strings' in comments
u'handles multilines' + u'okay'
u'Unicode is not touched'
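If you want to run this over a whole code base, a minimal driver might look like this (a sketch, not part of the answer above: the src directory is an assumption, and it rewrites files in place, so back up first):
import os

# Hypothetical driver: rewrite every .py file under src/ in place,
# using the change_str_to_unicode function defined above.
for dirpath, dirnames, filenames in os.walk('src'):
    for fn in filenames:
        if fn.endswith('.py'):
            full = os.path.join(dirpath, fn)
            with open(full) as f:
                text = f.read()
            with open(full, 'w') as f:
                f.write(change_str_to_unicode(text))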

Try this (it uses a regex); it's shorter than @unutbu's solution.
But there's a loophole: lines containing # inside a string won't work with this.
import re

scode = '''
print "'Hello World'" # prints 'Hello World'
u'Unicode is unchanged'
# so are "comments"'''

x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')

def repl(m):
    return "u%(c)s%(data)s%(c)s" % m.groupdict()

fcode = '\n'.join(
    [re.sub(x1, repl, i)
     if not '#' in i
     else re.sub(x1, repl, i[:i.find('#')]) + i[i.find('#'):]
     for i in scode.splitlines()])
print fcode
Outputs:
print u"'Hello World'" # prints 'Hello World'
u'Unicode is unchanged'
# so are "comments"
For # I have this (and it's longer than @unutbu's solution :| )
import re

scode = '''print "'Hello World'" # prints 'Hello World'
u'Unicode is unchanged'
# so are "comments"
'#### Hi' # 'Hi' '''

x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')

def in_string(text, index):
    # track quote nesting per character so we can tell if text[index]
    # falls inside a string literal
    curr, in_l, in_str, level = '', 0, False, []
    for c in text[:index+1]:
        if c == '"' or c == "'":
            if in_str and curr == c:
                in_str = False
                curr = ''
                in_l -= 1
            else:
                in_str = True
                curr = c
                in_l += 1
        level.append(in_l)
    return bool(level[index])

def repl(m):
    return "u%(c)s%(data)s%(c)s" % m.groupdict()

def handle_hashes(i):
    if i.count('#') == 1:
        n = i.find('#')
    else:
        n = get_hash_out_of_string(i)
    return re.sub(x1, repl, i[:n]) + i[n:]

def get_hash_out_of_string(i):
    # find the first '#' that is not inside a string literal
    n = i.find('#')
    curr = i[:]
    last = (len(i)-1) - ''.join(list(reversed(i))).find('#')
    while in_string(curr, n) and n < last:
        curr = curr[:n] + ' ' + curr[n+1:]
        n = curr.find('#')
    return n

fcode = '\n'.join(
    [re.sub(x1, repl, i)
     if not '#' in i
     else handle_hashes(i)
     for i in scode.splitlines()])
print fcode
Output:
print u"'Hello World'" # prints 'Hello World'
u'Unicode is unchanged'
# so are "comments"
u'#### Hi' # 'Hi'

Related

How to inject characters into a string?

I have a function in which I need to inject a character before every odd index of a string.
def inject(s, i):
    sent = str(s)
    new = []
    for w in sent:
        if w == ' ':
            new.append(w)
        elif (sent.index(w) % 2 == 1):
            new.append(str(i) + w)
        else:
            new.append(w)
    return ''.join(new)
The function works for:
inject(1339, 3)
output: '1333339'
and works for
inject('This is a test', 'x')
output: 'Txhixs ixs a txexst'
but won't work for:
inject('Hello World', 'x')
output: 'Hxello World'
I know it has something to do with it breaking on the 'll' but can't figure out how to fix it.
Thanks!
I think you meant to do this:
def inject(s, i):
    sent = str(s)
    new = []
    for idx, w in enumerate(sent):
        if w == ' ':
            new.append(w)
        elif idx % 2 == 1:
            new.append(str(i) + w)
        else:
            new.append(w)
    return ''.join(new)

for t in [1339, 'This is a test', 'Hello World']:
    print(t, inject(t, 'x'))
With enumerate() you get the index of each character without having to search for it.
The sent.index(w) call just doesn't do what you want: it returns the index of the first occurrence of w, so it only seems to work when there are no repeated characters in the string.
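A quick illustration of the difference, using an arbitrary string:
>>> sent = 'Hello'
>>> [sent.index(c) for c in sent]
[0, 1, 2, 2, 4]
>>> [i for i, c in enumerate(sent)]
[0, 1, 2, 3, 4]
Both 'l' characters report index 2 with index(), which is why every repeated character after the first one gets classified by the wrong position.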

Python Regex matching wrong strings

I have the following Python script which does regex matching, supporting 'AND' and 'OR' features as well:
import re

class PyBoolReException(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return str(self.value)

class PyBoolRe:
    def __init__(self, boolstr):
        # Require whitespace before words?
        self.__needspace = True
        # whitespace re
        self._wspre = re.compile('^\s*$')
        # create regexp string
        self.__rexplist = []
        oparct = boolstr.count('(')
        clparct = boolstr.count(')')
        if oparct != clparct:
            raise PyBoolReException, 'Mismatched parentheses!'
        self.__parse(boolstr)
        # if NOT is one of the members, reverse
        # the list
        # print self.__rexplist
        if '!' in self.__rexplist:
            self.__rexplist.reverse()
        s = self.__makerexp(self.__rexplist)
        # print s
        self.__rexp = re.compile(s)

    def match(self, data):
        """ Match the boolean expression, behaviour
        is same as the 'match' method of re """
        return self.__rexp.match(data)

    def search(self, data):
        """ Search the boolean expression, behaviour
        is same as the 'search' method of re """
        return self.__rexp.search(data)

    def __parse(self, s):
        """ Parse the boolean regular expression string
        and create the regexp list """
        # The string is a nested parentheses with
        # any character in between the parens.
        scopy = s[:]
        oparmatch, clparmatch = False, False
        # Look for a NOT expression
        index = scopy.rfind('(')
        l = []
        if index != -1:
            oparmatch = True
            index2 = scopy.find(')', index)
            if index2 != -1:
                clparmatch = True
                newstr = scopy[index+1:index2]
                # if the string is only of whitespace chars, skip it
                if not self._wspre.match(newstr):
                    self.__rexplist.append(newstr)
                replacestr = '(' + newstr + ')'
                scopy = scopy.replace(replacestr, '')
                self.__parse(scopy)
        if not clparmatch and not oparmatch:
            if scopy: self.__rexplist.append(scopy)

    def is_inbetween(self, l, elem):
        """ Find out if an element is in between
        in a list """
        index = l.index(elem)
        if index == 0:
            return False
        if index > 2:
            if index in range(1, len(l) - 1):
                return True
            else:
                return False
        else:
            return True

    def __makenotexpr(self, s):
        """ Make a NOT expression """
        if s.find('!') == 0:
            return ''.join(('(?!', s[1:], ')'))
        else:
            return s

    def __makerexp(self, rexplist):
        """ Make the regular expression string for
        the boolean match from the nested list """
        is_list = True
        if type(rexplist) is str:
            is_list = False
            elem = rexplist
        elif type(rexplist) is list:
            elem = rexplist[0]
            if type(elem) is list:
                elem = elem[0]
        eor = False
        if not is_list or len(rexplist) == 1:
            eor = True
        word_str = '.*'
        s = ''
        # Implementing NOT
        if elem == '!':
            return ''.join(('(?!', self.__makerexp(rexplist[1:]), ')'))
        # Implementing OR
        elif elem.find(' | ') != -1:
            listofors = elem.split(' | ')
            for o in listofors:
                index = listofors.index(o)
                in_bet = self.is_inbetween(listofors, o)
                if o:
                    o = self.__makenotexpr(o)
                    if in_bet:
                        s = ''.join((s, '|', word_str, o, '.*'))
                    else:
                        s = ''.join((s, word_str, o, '.*'))
        # Implementing AND
        elif elem.find(' & ') != -1:
            listofands = elem.split(' & ')
            for a in listofands:
                index = listofands.index(a)
                in_bet = self.is_inbetween(listofands, a)
                if a:
                    a = self.__makenotexpr(a)
                    s = ''.join((s, word_str, a, '.*'))
        else:
            if elem:
                elem = self.__makenotexpr(elem)
                s = ''.join((elem, '.*'))
        if eor:
            return s
        else:
            return ''.join((s, self.__makerexp(rexplist[1:])))
When the search phrase is as follows:
p = PyBoolRe('Python | Perl')
s1 = 'Guido invented Python'
s2 = 'Guido Perl'

if p.match(s1):
    print 'Match found for first string'
else:
    print 'No match found for first string'

if p.match(s2):
    print 'Match found for second string'
else:
    print 'No match found for second string'
Then both s1 & s2 match
But when the search phrase is:
p = PyBoolRe('Guido & (Python | Perl)')
s1 = 'Guido invented Python'
s2 = 'Guido Perl is great'
Then it should match if s1 or s2 contains "Guido Python" or "Guido Perl". s2 has that, but it does not match. On the other hand, it matches s1, which it should not. Why is that?
How can I get it to work?
Your generated expression is
.*Python.*|.*Perl.*.*Guido.*
while it should look like
(?=.*Guido.*)(?:.*Python.*|.*Perl.*)
So the parser needs some revision.
1) x|y should be enclosed in (?:...) (at least when used inside another block); otherwise | takes global priority in the regexp.
2) x & y should be converted into (?=x)y (a lookahead, i.e. trailing context, can be used to express the "and" between regular expressions).
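As a quick sanity check, here is a sketch that hand-builds the corrected expression for 'Guido & (Python | Perl)' rather than patching the parser:
import re

# the lookahead requires Guido somewhere; the (?:...) group keeps | local
fixed = re.compile('(?=.*Guido.*)(?:.*Python.*|.*Perl.*)')
print(bool(fixed.match('Guido invented Python')))  # True
print(bool(fixed.match('Guido Perl is great')))    # True
print(bool(fixed.match('Larry invented Perl')))    # False: no Guido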

How to write a small tokenizer in Python? [duplicate]

Normally, Python calls functions by
func(arg0, arg1)
But I would like to change to
func arg0 arg1
For example,
#Something...
cmd = input()
interpret(cmd)
#Something...
If I input 'func arg0 arg1', then I expect Python to execute func(arg0, arg1).
The args may contain strings with spaces, so we can't simply split on whitespace.
Actually, I would like to write some scripts to use on my mobile, so it would be a little annoying to type parentheses.
You can do this:
class tryClass:
    def callFunction(self, arg, arg2):
        print("In call")
        print(arg)
        print(arg2)

input = str(input())
input = input.split(" ")
funcName = input[0]
my_cls = tryClass()
method = getattr(my_cls, funcName)
method(input[1], input[2])
If I type callFunction hello world as input, it works :)
If none of the args contains whitespace, you could do:
fn_args = cmd.split()
python_code = "%s(%s)" % (fn_args[0], ", ".join(fn_args[1:]))
eval(python_code)
Edit:
If it is not that simple you should have a look at https://docs.python.org/3/library/cmd.html and https://docs.python.org/3/library/argparse.html but these require some preparation before you can execute arbitrary code
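For instance, a minimal cmd-based shell might look like the sketch below (the greet command is an invented example, not part of your script):
import cmd

class MyShell(cmd.Cmd):
    prompt = '> '

    # every do_<name> method becomes a command:
    # typing "greet world" calls do_greet("world")
    def do_greet(self, arg):
        print("Hello, %s!" % arg)

    def do_quit(self, arg):
        return True  # returning True ends cmdloop()

MyShell().cmdloop()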
Edit2:
If you do not need your args to be exact Python, you could parse them as JSON with the standard library, like so:
import json

cmd = 'fname "a" "b" 1'
fn, sep, args = cmd.strip().partition(" ")
end = 0
largs = []
d = json.JSONDecoder()
while end < len(args):
    args = args[end:].strip()
    arg, end = d.raw_decode(args)
    largs.append(arg)
eval(fn)(*largs)  # eval resolves the function name; read the eval/exec docs before using this on untrusted input
The builtin shlex module is probably what you want:
>>> import shlex
>>> cmd = "func arg0 arg1 'arg2 has spaces'"
>>> list(shlex.shlex(cmd))
['func', 'arg0', 'arg1', "'arg2 has spaces'"]
If you can trust the input, then actually calling this will look like:
>>> tokens = list(shlex.shlex(cmd))
>>> # here is a stupid func function that reverses its input args
>>> func = lambda *args: print(*reversed(args))
>>> eval(tokens[0])(*tokens[1:])
'arg2 has spaces' arg1 arg0
All I want is a simple tokenizer, and I would like to run functions by calling eval(). So here is what I did for my project.
Here's the result:
>>> tokenizer('func 123 abc')
[('func', 'func'), ('arg', '123'), ('arg', 'abc')]
>>> tokenizer('func 123.5 abc')
[('func', 'func'), ('arg', '123.5'), ('arg', 'abc')]
>>> tokenizer('func 123.5 abc "Hello, World!"')
[('func', 'func'), ('arg', '123.5'), ('arg', 'abc'), ('arg', 'Hello, World!')]
>>> tokenizer("func 123.5 abc 'Hello, World!'")
[('func', 'func'), ('arg', '123.5'), ('arg', 'abc'), ('arg', 'Hello, World!')]
Attention: this may not be suitable for everyone; it's not a complete parser or tokenizer.
Code:
def isNumber(cmd):
    try:
        int(cmd)
        return True
    except ValueError:
        try:
            float(cmd)
            return True
        except ValueError:
            return False
    return False

def isWord(cmd):
    if len(cmd) == 0:
        return False
    if cmd[0].isalpha():
        for i in cmd[1:]:
            if not i.isalpha() and i != '_' and i != '-':
                return False
        return True
    return False

def spaceParser(cmd):
    # skip leading spaces, return the rest of the string
    i = 0
    for i in range(len(cmd)):
        if cmd[i] == ' ':
            continue
        break
    return cmd[i:]

def funcNameParser(cmd):
    # returns (function name, remainder of the command)
    cmd = spaceParser(cmd)
    i = 0
    word = ''
    for i in range(len(cmd)):
        if cmd[i] != ' ':
            word += cmd[i]
        else:
            break
    if i + 1 > len(word):
        return (word, cmd[i:])
    return (word, cmd[i+1:])

def argumentParser(cmd):
    # returns (argument, remainder); handles 'quoted' and "quoted" strings
    cmd = spaceParser(cmd)
    if cmd[0] == '\'':
        word = ''
        i = 0
        for i in range(1, len(cmd)):
            if cmd[i] != '\'':
                word += cmd[i]
            else:
                return (word, cmd[i+1:])
        assert False, 'Fatal exception: String not finished.'
    if cmd[0] == '"':
        word = ''
        i = 0
        for i in range(1, len(cmd)):
            if cmd[i] != '"':
                word += cmd[i]
            else:
                return (word, cmd[i+1:])
        assert False, 'Fatal exception: String not finished.'
    i = 0
    word = ''
    for i in range(len(cmd)):
        if cmd[i] != ' ':
            word += cmd[i]
        else:
            break
    assert isWord(word) or isNumber(word), 'Fatal exception: Not a valid name.'
    if i + 1 > len(word):
        return (word, cmd[i:])
    return (word, cmd[i+1:])

def tokenizer(cmd):
    token = []
    result = funcNameParser(cmd)
    token += [('func', result[0])]
    while len(result[1]) != 0:
        result = argumentParser(result[1])
        token += [('arg', result[0])]
    return token
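With that tokenizer, dispatch could then look like the following sketch (greet and the globals() lookup are illustrative assumptions; looking the name up is a little safer than eval() on the raw command):
def greet(name, punct):
    print('Hello, ' + name + punct)

tokens = tokenizer('greet World "!!"')
funcname = tokens[0][1]                   # 'greet'
args = [val for kind, val in tokens[1:]]  # ['World', '!!']
globals()[funcname](*args)                # prints: Hello, World!!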

Reading a (compressed) file

My code:
sent = str(input("Please input a sentence: "))
splitsent = sent.split(' ')
dl = [0]
for count, v in enumerate(splitsent):
    if splitsent.count(v) < 2:
        dl.append(max(dl) + 1)
    else:
        dl.append(splitsent.index(v) + 1)
dl.remove(0)
print(sent, "\n", dl)
gives the output:
"1,2,3,4,1,2"
with the input:
"To be or not to be"
This is it in its "compressed" form. How would I take the output "1,2,3,4,1,2" from an external file and turn it back into "To be or not to be"?
Your method is really not an efficient way of compressing a text file; just use the existing zlib module.
But, for the academic exercise, you will want to use pickle to store your dictionary so that when you recover it you get the same values. Since you want the 'compressed' form to persist between invocations, so that you can successfully decompress a previously 'compressed' file, you will need to allocate an index to each word.
If you want a 'standard' Python method, OrderedDict from collections can be used to create an index in this way: new words are added to the end, but unlike conventional dict objects, old ones keep their position. A better method is an OrderedSet, but this is not in standard Python; see this recipe.
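As a tiny sketch of that indexing idea (a plain list stands in for the OrderedSet, and token numbers are 1-based to match the question's output):
words = []  # position in this list is the word's token

def token_for(word):
    # allocate a new index for unseen words, reuse the old one otherwise
    if word not in words:
        words.append(word)
    return words.index(word) + 1

compressed = [token_for(w) for w in "To be or not To be".split()]
print(compressed)                                  # [1, 2, 3, 4, 1, 2]
print(' '.join(words[t - 1] for t in compressed))  # To be or not To be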
Case
You also have to decide if 'THIS', 'this' and 'ThIs' are different words or the same word. Perhaps each word token needs a bitfield to indicate whether each character is lower or upper case, e.g. 'ThIs' gets token 15 with a case bitfield of 0b1010 (decimal 10), producing the tuple (15, 10) in the compressed file.
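A minimal sketch of such a case bitfield, MSB-first as in the __caseflag__ method of the example code below:
def case_bits(word):
    # one bit per character, 1 where that character is uppercase
    return int(''.join('1' if c.isupper() else '0' for c in word), 2)

def apply_case(word, bits):
    flag = bin(bits)[2:].zfill(len(word))
    return ''.join(c.upper() if b == '1' else c.lower()
                   for c, b in zip(word, flag))

print(case_bits('ThIs'))       # 10, i.e. binary 1010
print(apply_case('this', 10))  # ThIs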
Punctuation
You will also need to consider punctuation, where a word is thus punctuated you will need a way to represent this in the compressed form, a token for the punctuation character.
But there is a problem with this: when you decompress, you will need to recreate the original exactly, so you must handle punctuation. Does "Is this correct?" -> [1,2,3,4] decompress to "Is this correct ?" (with a space) or "Is this correct?" (without)?
So for each punctuation character you need to indicate how it joins to the previous and next token.
As punctuation is only ever one character (i.e. one 8-bit number), you may want to consider just putting the character in as-is.
Multiple spaces
You will also need to handle multiple spaces.
Example code
This code is incomplete, mostly untested and probably does not handle all use cases, but it illustrates one possible solution to the question.
To use it, create a file called in.txt containing the text you want to compress, then run
python compdict.py -c in.txt out.comp
or
python compdict.py -d out.comp out.txt
or
python compdict.py --list
from ordered_set import OrderedSet  # pip install ordered_set
import os
import cPickle as pickle
import string
import argparse

class CompDecomp(object):
    __DEFAULT_PICKLE_FN__ = "my.dict"
    printable_non_chars = set(string.printable) - set(string.digits) - set(string.ascii_letters)

    def __init__(self, fn=None, *args, **kw):
        if fn is None:
            self.fn = self.__DEFAULT_PICKLE_FN__
        else:
            self.fn = fn
        self.dict = self.loaddict()

    def loaddict(self):
        if os.path.exists(self.fn):
            pkl = open(self.fn, "rb")
            d = pickle.load(pkl)
            pkl.close()
        else:
            d = OrderedSet()
        return d

    def savedict(self):
        pkl = open(self.fn, "wb")
        pickle.dump(self.dict, pkl)
        pkl.close()

    def compressword(self, word, conjoin=False):
        if word.lower() not in self.dict:
            self.dict.append(word.lower())
            print "New word: \'%s\'" % word
            self.savedict()
        index, flag, _ = self.__caseflag__(word, conjoin)
        #print index, bin(flag)[2:].zfill(len(word)), conjoin
        return index, flag, conjoin

    def decompressword(self, index, caseflag=0, conjoin=False):
        if isinstance(index, int):
            word = self.dict[index]
        else:
            word = index
        if caseflag == 0:
            return word, conjoin
        flag = bin(caseflag)[2:].zfill(len(word))
        res = ""
        for n, c in enumerate(word):
            if flag[n] == '1':
                res += c.upper()
            else:
                res += c.lower()
        return res, conjoin

    def __caseflag__(self, word, conjoin):
        index = self.dict.index(word.lower())
        if word.lower() == word:
            # Word is all lowercase
            return (index, 0, conjoin)
        if word.upper() == word:
            # Word is all uppercase
            return index, int("1" * len(word), 2), conjoin
        res = ""
        for c in word:
            if c in string.uppercase:
                res += "1"
            else:
                res += "0"
        return index, int(res, 2), conjoin

    def compressfile(self, fileobj):
        with fileobj as f:
            data = f.read(-1)
            f.close()
        words = data.split(" ")
        compress = []
        for word in words:
            # Handle multiple spaces
            if word == "":
                compress.append(" ")
                continue
            # Handle punctuation, treat apostrophied words as new words
            substr = []
            p1 = 0
            csplit = word.translate(None, string.ascii_letters + '\'')
            for n, c in enumerate(csplit):
                subword, word = word.split(c, 1)
                compress.append(self.compressword(subword, True if n > 0 else False))
                compress.append((c, 0, True))
            # Handle words
            if len(word) and not len(csplit):
                compress.append(self.compressword(word))
        return compress

    def decompressfile(self, fileobj):
        data = pickle.load(fileobj)
        decomp = ""
        for v in data:
            if not isinstance(v, tuple):
                print "Bad data %s" % v
                continue
            if len(v) > 0 and len(v) <= 3:
                d, conjoin = self.decompressword(*v)
                if len(decomp):
                    decomp += "" if conjoin else " "
                decomp += d
            else:
                print "Bad data %s (length %d)" % (v, len(v))
        return decomp

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Test file compress / decompress')
    group = parser.add_mutually_exclusive_group()
    parser.add_argument('infile', nargs='?', default=None)
    parser.add_argument('outfile', nargs='?', default=None)
    group.add_argument('-compress', action='store_true')
    group.add_argument('-decompress', action='store_true')
    group.add_argument('--list', action='store_true')
    args = parser.parse_args()
    cd = CompDecomp()
    # Invocation
    # python dictcompress.py [-h|-c|-d|--list] [<infile>] [<outfile>]
    infile, outfile = args.infile, args.outfile
    if infile is not None and not os.path.exists(infile):
        print "Input file missing"
    if outfile is not None:
        of = open(outfile, "wb")
    else:
        of = None
    if not args.list:
        if args.compress:
            print "Compress"
            pickle.dump(cd.compressfile(open(infile, "r")), of)
        if args.decompress:
            print "Decompress"
            of.write(cd.decompressfile(open(infile, "r")))
    else:
        for k in cd.dict:
            print k
    if of is not None:
        of.close()

regex to replace regex

I have this regex for getting strings in Python code:
x1 = re.compile('''((?P<unicode>u?)(?P<c1>'|")(?P<data>.+?)(?P<c2>'|"))''')
I want to extract the data and c1,c2 parts of this regex to make a replace string (if c1 == c2)
Something like:
repl = "u<c1><data><c2>"
How can I do this?
Is that possible in one line, or by using re.sub?
UPDATE:
My new code:
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')
def repl(match):
    if '#' in match.string:
        pass  ### Confused
    return "u%(c)s%(data)s%(c)s" % match.groupdict()

fcode = '\n'.join([re.sub(x1, repl, i) for i in scode.splitlines()])
Here I am having trouble working out how to avoid changing strings inside comments. What do I have to do to ignore the comments?
Say you have a pattern:
pattern = r'''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''' # did a little tweak
Match a string:
m = re.search(pattern, 'print("hello")')
What you got:
>>> m.groups()
('', '"', 'hello')
>>> m.groupdict()
{'c': '"', 'unicode': '', 'data': 'hello'}
Now you can do whatever you want with these:
>>> 'u{c}{data}{c}'.format_map(m.groupdict())
'u"hello"'
Maybe you are using Python 2.x:
>>> 'u{c}{data}{c}'.format(**m.groupdict())
'u"hello"'
Or if you like the old % operator:
>>> "u%(c)s%(data)s%(c)s" % m.groupdict()
'u"hello"'
Edited:
The regex solution can't handle some situations correctly.
So I used a 2to3 hack (it's actually 3to2, and it still can't solve everything):
cd /usr/lib/python3.3/lib2to3/fixes/
cp fix_unicode.py fix_unicode33.py
Edit fix_unicode33.py
-_literal_re = re.compile(r"[uU][rR]?[\'\"]")
+_literal_re = re.compile(r"[rR]?[\'\"]")
-class FixUnicode(fixer_base.BaseFix):
+class FixUnicode33(fixer_base.BaseFix):
- new.value = new.value[1:]
+ new.value = 'u' + new.value
Now 2to3 --list | grep unicode33 should output unicode33
Then you can run 2to3 -f unicode33 py3files.py.
Remember to remove fix_unicode33.py afterwards.
NOTE: In Python 3, ur"string" throws a SyntaxError. The logic here is simple; modify it to reach your goal.
The long code I ended up with.
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')

def in_string(text, index):
    # track quote nesting per character so we can tell if text[index]
    # falls inside a string literal
    curr, in_l, in_str, level = '', 0, False, []
    for c in text[:index+1]:
        if c == '"' or c == "'":
            if in_str and curr == c:
                in_str = False
                curr = ''
                in_l -= 1
            else:
                in_str = True
                curr = c
                in_l += 1
        level.append(in_l)
    return bool(level[index])

def repl(m):
    return "u%(c)s%(data)s%(c)s" % m.groupdict()

def handle_hashes(i):
    if i.count('#') == 1:
        n = i.find('#')
    else:
        n = get_hash_out_of_string(i)
    return re.sub(x1, repl, i[:n]) + i[n:]

def get_hash_out_of_string(i):
    # find the first '#' that is not inside a string literal
    n = i.find('#')
    curr = i[:]
    last = (len(i)-1) - ''.join(list(reversed(i))).find('#')
    while in_string(curr, n) and n < last:
        curr = curr[:n] + ' ' + curr[n+1:]
        n = curr.find('#')
    return n
