How to write a small tokenizer in Python? [duplicate] - python

This question already has answers here:
Can you add new statements to Python's syntax?
(13 answers)
Mini-languages in Python
(6 answers)
Closed 6 years ago.
Normally, Python calls functions by
func(arg0, arg1)
But I would like to change to
func arg0 arg1
For example,
#Something...
cmd = input()
interpret(cmd)
#Something...
If I input 'func arg0 arg1', then I expect Python to execute func(arg0, arg1).
The arguments may themselves contain strings (possibly with spaces), so we can't simply split the input on whitespace.
Actually, I would like to write some scripts to use on my mobile. So it would be a little annoying to type parentheses.

You can do this :
class tryClass:
    """Container for the methods that a typed command may dispatch to."""

    def callFunction(self, arg, arg2):
        """Announce the call, then print both arguments on their own lines."""
        for line in ("In call", arg, arg2):
            print(line)
# Read one command line, e.g. "callFunction hello world".
# The tokens get their own name so the builtin `input` is not rebound and can
# still be called later; `input()` already returns a str on Python 3.
tokens = input().split(" ")
funcName = tokens[0]
my_cls = tryClass()
# NOTE: getattr lets the typed text select any attribute of the instance;
# only use this with trusted input.
method = getattr(my_cls, funcName)
method(tokens[1], tokens[2])
If I put in input callFunction hello world it works :)

if none of the args contains whitespace you could do
# Split the command into the function name followed by its arguments.
fn_args = cmd.split()
# The first token is the name (the original indexed an undefined `fn`); the
# remaining tokens become a comma-separated argument list: "func(arg0, arg1)".
python_code = "%s(%s)" % (fn_args[0], ", ".join(fn_args[1:]))
# WARNING: eval() runs arbitrary code; only use this on trusted input.
eval(python_code)
Edit:
If it is not that simple you should have a look at https://docs.python.org/3/library/cmd.html and https://docs.python.org/3/library/argparse.html but these require some preparation before you can execute arbitrary code
Edit2:
If you do not need your args to be exact python, you could parse them as json with the standard library
you could do it like
import json

cmd = 'fname "a" "b" 1'
# First word is the function name; everything after it is the argument text.
fn, sep, args = cmd.strip().partition(" ")
largs = []
d = json.JSONDecoder()
# raw_decode() does not skip leading whitespace, so the remaining text is
# stripped before every decode step.
args = args.strip()
while args:
    # raw_decode returns the decoded value and the index where it stopped.
    arg, end = d.raw_decode(args)
    largs.append(arg)
    args = args[end:].strip()
# eval() is required to look the name up and get the callable back; exec()
# always returns None, so `exec(fn)(...)` could never call the function.
# WARNING: eval() runs arbitrary code; only use this on trusted input.
eval(fn)(*largs)

The builtin shlex module is probably what you want:
>>> import shlex
>>> cmd = "func arg0 arg1 'arg2 has spaces'"
>>> list(shlex.shlex(cmd))
['func', 'arg0', 'arg1', "'arg2 has spaces'"]
If you can trust the input, then actually calling this will look like:
>>> tokens = list(shlex.shlex(cmd))
>>> # here is a stupid func function that reverses its input args
>>> func = lambda *args: print(*reversed(args))
>>> eval(tokens[0])(*tokens[1:])
'arg2 has spaces' arg1 arg0

All I want is a simple tokenizer. And I would like to run functions by calling eval(). So that's what I did for my project.
Here's the result:
>>> tokenizer('func 123 abc')
[('func', 'func'), ('arg', '123'), ('arg', 'abc')]
>>> tokenizer('func 123.5 abc')
[('func', 'func'), ('arg', '123.5'), ('arg', 'abc')]
>>> tokenizer('func 123.5 abc "Hello, World!"')
[('func', 'func'), ('arg', '123.5'), ('arg', 'abc'), ('arg', 'Hello, World!')]
>>> tokenizer("func 123.5 abc 'Hello, World!'")
[('func', 'func'), ('arg', '123.5'), ('arg', 'abc'), ('arg', 'Hello, World!')]
Attention: This may not be suitable for everyone; it is not a complete parser or tokenizer.
Code:
def isNumber(cmd):
    """Return True if *cmd* can be converted by ``int`` or ``float``.

    ``int`` is tried first and ``float`` second; a ``ValueError`` from both
    means the text is not a number and the result is False.  The original
    carried an extra ``return False`` after the nested try blocks that could
    never be reached; it has been removed.
    """
    for convert in (int, float):
        try:
            convert(cmd)
        except ValueError:
            continue
        return True
    return False
def isWord(cmd):
    """Return True if *cmd* is a word.

    A word starts with an alphabetic character and continues with alphabetic
    characters, underscores or hyphens.  The empty string is not a word.
    """
    if not cmd or not cmd[0].isalpha():
        return False
    return all(ch.isalpha() or ch in ('_', '-') for ch in cmd[1:])
def spaceParser(cmd):
    """Return *cmd* with its leading space characters removed.

    Only the space character is skipped (not tabs or newlines).  A string
    made up entirely of spaces yields ``''``; the index loop it replaces
    stopped on the final space and returned it.
    """
    return cmd.lstrip(' ')
def funcNameParser(cmd):
    """Split the function name off the front of *cmd*.

    Leading spaces are skipped, then everything up to the first space is the
    name.  Returns ``(name, rest)`` where ``rest`` is the unparsed remainder,
    still beginning with the separating space when there is one.  Empty or
    all-space input yields ``('', '')``.
    """
    # Leading spaces are dropped here directly so that whitespace-only input
    # cleanly produces an empty name and an empty remainder.
    stripped = cmd.lstrip(' ')
    word, sep, tail = stripped.partition(' ')
    # The separator is kept at the front of the remainder.
    return (word, sep + tail)
def argumentParser(cmd):
    """Parse one argument from the front of *cmd*.

    Leading spaces are skipped.  An argument is either a string enclosed in
    single or double quotes (returned without the quotes) or a bare token
    that must satisfy ``isWord`` or ``isNumber``.

    Returns ``(argument, rest)`` where ``rest`` is the unparsed remainder.

    Raises:
        AssertionError: if a quoted string is not closed, if the bare token
            is neither a word nor a number, or if no argument is left.  The
            exception is raised explicitly (not through ``assert``) so the
            checks are still performed when Python runs with ``-O``.
    """
    cmd = cmd.lstrip(' ')
    if not cmd:
        # Nothing left to parse; previously this was an IndexError on cmd[0].
        raise AssertionError('Fatal exception: Not a valid name.')

    quote = cmd[0]
    if quote in ('\'', '"'):
        # The string ends at the next occurrence of the same quote character.
        close = cmd.find(quote, 1)
        if close == -1:
            raise AssertionError('Fatal exception: String not finished.')
        return (cmd[1:close], cmd[close + 1:])

    # Bare token: everything up to the first space; the separating space is
    # kept at the front of the remainder.
    word, sep, tail = cmd.partition(' ')
    if not (isWord(word) or isNumber(word)):
        raise AssertionError('Fatal exception: Not a valid name.')
    return (word, sep + tail)
def tokenizer(cmd):
    """Tokenize a command line of the form ``name arg0 arg1 ...``.

    Returns a list of ``(kind, text)`` tuples: one ``('func', name)`` entry
    followed by one ``('arg', value)`` entry per argument.  Errors raised by
    ``argumentParser`` propagate to the caller.
    """
    name, rest = funcNameParser(cmd)
    tokens = [('func', name)]
    # Stop once only spaces remain, so trailing whitespace after the last
    # argument is not handed to argumentParser as if it were an argument.
    while rest.strip(' '):
        value, rest = argumentParser(rest)
        tokens.append(('arg', value))
    return tokens

Related

How to inject characters into a string?

I have a function in which I need to inject a character before every odd index of a string.
def inject(s, i):
    """Return ``str(s)`` with ``str(i)`` put in front of selected characters.

    Spaces are copied unchanged.  Any other character gets the prefix when
    ``sent.index(ch)`` is odd; ``str.index`` reports the position of the
    first occurrence of that character in the string.
    """
    sent = str(s)
    prefix = str(i)
    pieces = [
        prefix + ch if ch != ' ' and sent.index(ch) % 2 == 1 else ch
        for ch in sent
    ]
    return ''.join(pieces)
The function works for:
inject(1339, 3)
output: '1333339'
and works for
inject('This is a test', 'x')
output: 'Txhixs ixs a txexst'
but won't work for:
inject('Hello World', 'x')
output: 'Hxello World'
I know it has something to do with it breaking on the 'll' but can't figure out how to fix it.
Thanks!
I think you meant to do this:
def inject(s, i):
    """Return ``str(s)`` with ``str(i)`` inserted before every odd index.

    Positions come from ``enumerate``, so repeated characters are handled by
    their actual index.  Space characters are always copied unchanged.
    """
    sent = str(s)
    marker = str(i)
    return ''.join(
        marker + ch if idx % 2 == 1 and ch != ' ' else ch
        for idx, ch in enumerate(sent)
    )


# Show the result for each of the sample inputs.
for t in (1339, 'This is a test', 'Hello World'):
    print(t, inject(t, 'x'))
When using the enumerate() you get the index of each character without having to search for it.
The sent.index(w) call just doesn't do what you want it to even though it might seem like it does when there are no repeated characters in a string.

Python Regex matching wrong strings

I have the following python script which does regex matching using 'AND', 'OR' features as well:
class PyBoolReException(Exception):
    """Error raised by PyBoolRe; carries an arbitrary payload in ``value``."""

    def __str__(self):
        """Return the string form of the stored payload."""
        return str(self.value)

    def __init__(self, value):
        """Remember *value* so that ``__str__`` can report it."""
        self.value = value
# Compiles a boolean expression string (sub-expressions in parentheses,
# ' | ' for OR, ' & ' for AND, a leading '!' for NOT) into one regular
# expression and exposes match()/search() on it.
# NOTE: written for Python 2 (see the `raise X, msg` form below) and relies on
# the `re` module, whose import is not part of this snippet.
class PyBoolRe:
def __init__(self, boolstr):
# Require whitespace before words?
self.__needspace = True
# whitespace re
self._wspre = re.compile('^\s*$')
# create regexp string
self.__rexplist = []
# Only the number of '(' and ')' is compared, not their order.
oparct = boolstr.count('(')
clparct = boolstr.count(')')
if oparct != clparct:
raise PyBoolReException, 'Mismatched parantheses!'
self.__parse(boolstr)
# if NOT is one of the members, reverse
# the list
# print self.__rexplist
if '!' in self.__rexplist:
self.__rexplist.reverse()
s = self.__makerexp(self.__rexplist)
# print s
self.__rexp = re.compile(s)
def match(self, data):
""" Match the boolean expression, behaviour
is same as the 'match' method of re """
return self.__rexp.match(data)
def search(self, data):
""" Search the boolean expression, behaviour
is same as the 'search' method of re """
return self.__rexp.search(data)
def __parse(self, s):
""" Parse the boolean regular expression string
and create the regexp list """
# The string is a nested parantheses with
# any character in between the parens.
scopy = s[:]
oparmatch, clparmatch = False, False
# Look for a NOT expression
# rfind() picks the last '(' in the string, so the right-most group is
# extracted first; it is then removed and the remainder parsed again.
index = scopy.rfind('(')
# `l` is assigned but not used anywhere in this method.
l = []
if index != -1:
oparmatch = True
index2 = scopy.find(')', index)
if index2 != -1:
clparmatch = True
newstr = scopy[index+1:index2]
# if the string is only of whitespace chars, skip it
if not self._wspre.match(newstr):
self.__rexplist.append(newstr)
replacestr = '(' + newstr + ')'
# str.replace removes every occurrence of this exact group text.
scopy = scopy.replace(replacestr, '')
self.__parse(scopy)
# No parentheses left: whatever text remains is appended as the last entry.
if not clparmatch and not oparmatch:
if scopy: self.__rexplist.append(scopy)
def is_inbetween(self, l, elem):
""" Find out if an element is in between
in a list """
# list.index returns the first position equal to elem, so duplicate entries
# all resolve to the same index.
index = l.index(elem)
if index == 0:
return False
if index>2:
if index in range(1, len(l) -1):
return True
else:
return False
else:
return True
def __makenotexpr(self, s):
""" Make a NOT expression """
# A leading '!' turns the rest of the term into a negative lookahead.
if s.find('!') == 0:
return ''.join(('(?!', s[1:], ')'))
else:
return s
def __makerexp(self, rexplist):
""" Make the regular expression string for
the boolean match from the nested list """
is_list = True
if type(rexplist) is str:
is_list = False
elem = rexplist
elif type(rexplist) is list:
elem = rexplist[0]
if type(elem) is list:
elem = elem[0]
# eor is set when this is the last entry, i.e. nothing is left to recurse on.
eor = False
if not is_list or len(rexplist) == 1:
eor = True
word_str = '.*'
s=''
# Implementing NOT
if elem == '!':
return ''.join(('(?!', self.__makerexp(rexplist[1:]), ')'))
# Implementing OR
elif elem.find(' | ') != -1:
listofors = elem.split(' | ')
for o in listofors:
# `index` is computed here but not used afterwards.
index = listofors.index(o)
in_bet = self.is_inbetween(listofors, o)
if o:
o = self.__makenotexpr(o)
# The alternatives are joined with a bare '|' and are not wrapped in a
# group, so the '|' also applies to whatever is concatenated after them.
if in_bet:
s = ''.join((s, '|', word_str, o, '.*'))
else:
s = ''.join((s, word_str, o, '.*'))
# Implementing AND
elif elem.find(' & ') != -1:
listofands = elem.split(' & ')
for a in listofands:
# `index` and `in_bet` are computed here but not used afterwards.
index = listofands.index(a)
in_bet = self.is_inbetween(listofands, a)
if a:
a = self.__makenotexpr(a)
# AND terms are simply concatenated, each wrapped as '.*term.*'.
s = ''.join((s, word_str, a, '.*'))
else:
if elem:
elem = self.__makenotexpr(elem)
s = ''.join((elem, '.*'))
# Last entry: return what was built; otherwise append the expression for the
# remaining entries.
if eor:
return s
else:
return ''.join((s, self.__makerexp(rexplist[1:])))
When the search phrase is as follows:
p = PyBoolRe('Python | Perl')
s1 = 'Guido invented Python'
s2 = 'Guido Perl'
if p.match(s1):
print 'Match found for first string'
else:
print 'No match found for first string'
if p.match(s2):
print 'Match found for second string'
else:
print 'No match found for second string'
Then both s1 & s2 match
But when the search phrase is:
p = PyBoolRe('Guido & (Python | Perl)')
s1 = 'Guido invented Python'
s2 = 'Guido Perl is great'
Then it should match if s1 or s2 has "Guido Python" or "Guido Perl". s2 has that but it does not match it. On the other hand, it matches s1, which it should not. Why is that?
Please help!! How can I get it to work??
Your generated expression is
.*Python.*|.*Perl.*.*Guido.*
while it should look like
(?=.*Guido.*)(?:.*Python.*|.*Perl.*)
So the parser needs some revision.
1) x|y should be enclosed into (?:...) (at least when used inside another block). Otherwise, | unluckily takes the global priority in the regexp.
2) x & y should be converted into (?=x)y (trailing context may be used to express the and between regular expressions)

Fastest and/or most pythonic way to convert string into a number [closed]

As it currently stands, this question is not a good fit for our Q&A format. We expect answers to be supported by facts, references, or expertise, but this question will likely solicit debate, arguments, polling, or extended discussion. If you feel that this question can be improved and possibly reopened, visit the help center for guidance.
Closed 9 years ago.
first of all, I've seen quite a few questions related to this (convert string to float, etc etc), but I need something more generic, which I could not find (so I hope this will also help out other people with a similar problem).
I have made a solution, but am wondering whether it is the best solution in terms of 1) performance and 2) pythonic elegance.
The problem in short:
I get data from a variety of sources, these are made into a list with
dicts (as a row/column table setup).
The variety means that I cannot
rely on a fixed input type (basically they might be string, boolean,
int, float) but the user can designate which columns (keys in the
dict) are values.
Which I then need to convert to actual value types (we're talking about 100s of millions of rows of data here, so performance is rather key).
If the input is not a real number (like: 'aaa'), then it should return None.
There might be currency symbols and thousand separators (which need to be removed), and decimal separators (which need to be replaced by the standard dot, if it's not a dot)
So what have I made:
import ast
import types
NumberTypes = (types.IntType, types.LongType, types.FloatType, types.ComplexType)
def mk_value(s, currency_sign='', thousand_sep='', decimal_sep='.'):
    """Convert *s* to a number, or return None when it is not numeric.

    Booleans become 1/0 and values that already are numbers are returned
    unchanged.  Anything else is treated as a string: ``currency_sign`` and
    ``thousand_sep`` are removed, ``decimal_sep`` is replaced by a dot,
    surrounding whitespace is stripped and the remainder is evaluated with
    ``ast.literal_eval``.  The result is kept only if it is a number type.
    """
    # bool is tested first: it is a subclass of int and would otherwise be
    # returned unchanged by the NumberTypes check.
    if isinstance(s, bool):
        return 1 if s else 0
    if isinstance(s, NumberTypes):
        return s

    # Prepare the string for conversion.
    if currency_sign != '':
        s = s.replace(currency_sign, '')
    if thousand_sep != '':
        s = s.replace(thousand_sep, '')
    if decimal_sep != '.':
        s = s.replace(decimal_sep, '.')
    s = s.strip()
    if s == '':
        return None

    try:
        result = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # Text that is not valid Python syntax (e.g. '1.234.567' or '12 EUR')
        # makes literal_eval raise SyntaxError rather than ValueError; both
        # mean "not a number" here.
        return None
    return result if isinstance(result, NumberTypes) else None
You can test it by:
mk_value(True)
mk_value(1234)
mk_value(1234.56)
mk_value('1234')
mk_value('1234.56')
mk_value('1,234.56') # without an explicit decimal separator this is not a number
mk_value('1.234.567,89 EUR', currency_sign='EUR', thousand_sep='.', decimal_sep=',') # all exceptions
So this works (as far as I can see); but is this the best/most pythonic way? Are there faster ways? Should I look into Cython for this? Any ideas on improving this would be really helpful!
BR
Carst
Edit: I've updated my code based on the suggestions by Andrew and WoLpH. It now looks like this:
import types
NumberTypes = (types.IntType, types.LongType, types.FloatType, types.ComplexType)
def mk_value(s, currency_sign='', thousand_sep='', decimal_sep='.'):
    """Convert *s* to a number, or return None when it is not numeric.

    Booleans become 1/0 and values that already are numbers are returned
    unchanged.  Anything else is treated as a string: ``currency_sign`` and
    ``thousand_sep`` are removed, ``decimal_sep`` is replaced by a dot and
    surrounding whitespace is stripped before ``int`` and then ``float`` are
    tried.
    """
    # bool is tested first: it is a subclass of int and would otherwise be
    # returned unchanged by the NumberTypes check.
    if isinstance(s, bool):
        return 1 if s else 0
    if isinstance(s, NumberTypes):
        return s

    text = s
    if currency_sign:
        text = text.replace(currency_sign, '')
    if thousand_sep:
        text = text.replace(thousand_sep, '')
    if decimal_sep != '.':
        text = text.replace(decimal_sep, '.')
    text = text.strip()
    if not text:
        # An empty string is not a number.
        return None

    # int first, float as the fallback; ValueError from both means the text
    # is not a number.
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    return None
the previous code's performance was this:
>>> timeit.timeit("mk_value(1234)", 'from __main__ import mk_value', number=100000)
0.050575971603393555
>>> timeit.timeit("mk_value(1234.56)", 'from __main__ import mk_value', number=100000)
0.07073187828063965
>>> timeit.timeit("mk_value('1234')", 'from __main__ import mk_value', number=100000)
0.8333430290222168
>>> timeit.timeit("mk_value('1234.56')", 'from __main__ import mk_value', number=100000)
0.8230760097503662
>>> timeit.timeit("mk_value('1,234.56', thousand_sep=',')", 'from __main__ import mk_value', number=100000)
0.9358179569244385
the new code's performance:
>>> timeit.timeit("mk_value(1234)", 'from __main__ import mk_value', number=100000)
0.04723405838012695
>>> timeit.timeit("mk_value(1234.56)", 'from __main__ import mk_value', number=100000)
0.06952905654907227
>>> timeit.timeit("mk_value('1234')", 'from __main__ import mk_value', number=100000)
0.1798090934753418
>>> timeit.timeit("mk_value('1234.56')", 'from __main__ import mk_value', number=100000)
0.45616698265075684
>>> timeit.timeit("mk_value('1,234.56', thousand_sep=',')", 'from __main__ import mk_value', number=100000)
0.5290899276733398
So that's a lot faster: almost twice as fast for the most complex one and much much faster for the int (I guess as it's the first in the try/except logic)! Really great, thanks for your input.
I'm going to leave it open for now to see if someone has a brilliant idea on how to improve more :) At the very least I hope this will help other people in the future (it must be a very common issue)
It could be slightly more Pythonic imho, but I'm not sure about the best solution yet.
Code
benchmark.py
# vim: set fileencoding=utf-8 :
import timeit
import pyximport
pyximport.install()
# Time the statement string `func` against the mk_value imported from module
# `mod` and print one formatted result line.
# NOTE: Python 2 code (print statement).
def timer(func, mod):
# Setup statement handed to timeit so that `mk_value` resolves to mod's version.
import_ = 'from %s import mk_value' % mod
# timeit.timeit returns the total time in seconds for all `number` executions.
time = timeit.timeit(func, import_, number=100000)
ms = 1000 * time
us = 1000 * ms
# Statements longer than 40 characters are cut to 37 characters plus '...'.
if func[40:]:
func_short = func[:37] + '...'
else:
func_short = func
# The format string pulls mod, func_short, ms and us out of locals() by name,
# so those local variable names must stay as they are.
print '%(mod)s.%(func_short)-40s %(ms)6dms %(us)12dμs' % locals()
# Run the same six statements against each of the modules a, b, c and d.
for mod in 'abcd':
timer("mk_value(1234)", mod)
timer("mk_value(1234.56)", mod)
timer("mk_value('1234')", mod)
timer("mk_value('1234.56')", mod)
timer("mk_value('1,234.56', thousand_sep=',')", mod)
timer("mk_value('1.234.567,89 EUR', currency_sign='EUR', "
"thousand_sep='.', decimal_sep=',')", mod)
a.py
import ast
import types
NumberTypes = (types.IntType, types.LongType, types.FloatType, types.ComplexType)
def mk_value(s, currency_sign='', thousand_sep='', decimal_sep='.'):
    """Convert *s* to a number, or return None when it is not numeric.

    Booleans become 1/0 and values that already are numbers are returned
    unchanged.  Anything else is treated as a string: ``currency_sign`` and
    ``thousand_sep`` are removed, ``decimal_sep`` is replaced by a dot,
    surrounding whitespace is stripped and the remainder is evaluated with
    ``ast.literal_eval``.  The result is kept only if it is a number type.
    """
    # bool is tested first: it is a subclass of int and would otherwise be
    # returned unchanged by the NumberTypes check.
    if isinstance(s, bool):
        return 1 if s else 0
    if isinstance(s, NumberTypes):
        return s

    # Prepare the string for conversion.
    if currency_sign != '':
        s = s.replace(currency_sign, '')
    if thousand_sep != '':
        s = s.replace(thousand_sep, '')
    if decimal_sep != '.':
        s = s.replace(decimal_sep, '.')
    s = s.strip()
    if s == '':
        return None

    try:
        result = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # Text that is not valid Python syntax (e.g. '1.234.567' or '12 EUR')
        # makes literal_eval raise SyntaxError rather than ValueError; both
        # mean "not a number" here.
        return None
    return result if isinstance(result, NumberTypes) else None
b.py
import types
NumberTypes = (types.IntType, types.LongType, types.FloatType, types.ComplexType)
def mk_value(s, currency_sign='', thousand_sep='', decimal_sep='.'):
    """Convert *s* to a number, or return None when it is not numeric.

    Booleans become 1/0 and values that already are numbers are returned
    unchanged.  Anything else is treated as a string: ``currency_sign`` and
    ``thousand_sep`` are removed, ``decimal_sep`` is replaced by a dot and
    surrounding whitespace is stripped before ``int`` and then ``float`` are
    tried.
    """
    # bool is tested first: it is a subclass of int and would otherwise be
    # returned unchanged by the NumberTypes check.
    if isinstance(s, bool):
        return 1 if s else 0
    if isinstance(s, NumberTypes):
        return s

    text = s
    if currency_sign:
        text = text.replace(currency_sign, '')
    if thousand_sep:
        text = text.replace(thousand_sep, '')
    if decimal_sep != '.':
        text = text.replace(decimal_sep, '.')
    text = text.strip()
    if not text:
        # An empty string is not a number.
        return None

    # int first, float as the fallback; ValueError from both means the text
    # is not a number.
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    return None
c.pyx
import types
NumberTypes = (types.IntType, types.LongType, types.FloatType, types.ComplexType)
def mk_value(s, currency_sign='', thousand_sep='', decimal_sep='.'):
    """Convert *s* to a number, or return None when it is not numeric.

    Booleans become 1/0 and values that already are numbers are returned
    unchanged.  Anything else is treated as a string: ``currency_sign`` and
    ``thousand_sep`` are removed, ``decimal_sep`` is replaced by a dot and
    surrounding whitespace is stripped before ``int`` and then ``float`` are
    tried.
    """
    # bool is tested first: it is a subclass of int and would otherwise be
    # returned unchanged by the NumberTypes check.
    if isinstance(s, bool):
        return 1 if s else 0
    if isinstance(s, NumberTypes):
        return s

    text = s
    if currency_sign:
        text = text.replace(currency_sign, '')
    if thousand_sep:
        text = text.replace(thousand_sep, '')
    if decimal_sep != '.':
        text = text.replace(decimal_sep, '.')
    text = text.strip()
    if not text:
        # An empty string is not a number.
        return None

    # int first, float as the fallback; ValueError from both means the text
    # is not a number.
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    return None
d.pyx
import types
NumberTypes = (types.IntType, types.LongType, types.FloatType, types.ComplexType)
def mk_value(s, currency_sign='', thousand_sep='', decimal_sep='.'):
    """Convert *s* to a number, or return None when it is not numeric.

    Booleans become 1/0 and values that already are numbers are returned
    unchanged.  A non-empty remaining value has ``currency_sign`` removed and
    is then handed to ``_mk_value`` for separator handling and conversion;
    an empty (falsy) value yields None.
    """
    # bool is tested first: it is a subclass of int and would otherwise be
    # returned unchanged by the NumberTypes check.
    if isinstance(s, bool):
        return 1 if s else 0
    if isinstance(s, NumberTypes):
        return s
    if not s:
        return None
    if currency_sign:
        s = s.replace(currency_sign, '')
    return _mk_value(s, currency_sign, thousand_sep, decimal_sep)
cdef object _mk_value(char *s, char *currency_sign, char *thousand_sep, char *decimal_sep):
    # Compact the C string `s` in place, then convert it with int()/float().
    #
    # While copying, the first character of `decimal_sep` is written as '.',
    # and the first character of `thousand_sep` as well as spaces are dropped.
    # `currency_sign` is accepted but not used here.
    # Returns the converted number, or None when nothing is left or the text
    # is not numeric.
    #
    # NOTE(review): this writes into the buffer behind `s`; if that buffer
    # belongs to a Python string object the object itself is modified.
    # Confirm that callers always pass a private copy.
    cdef int i=0, j=0
    result = None
    while s[i]:
        if s[i] == decimal_sep[0]:
            s[j] = '.'
            j += 1
        elif s[i] == thousand_sep[0] or s[i] == ' ':
            # Separator or space: skip it (the read index moves on, the
            # write index does not).
            pass
        else:
            s[j] = s[i]
            j += 1
        i += 1
    # Terminate the compacted text.  Without this, every dropped character
    # leaves one stale character at the end of the buffer, which would then
    # be parsed as part of the number (e.g. "1,234.56" read as "1234.566").
    s[j] = 0
    # convert the string
    if j == 0:  # nothing was kept, so it is not a number
        return None
    try:  # try int
        result = int(s)
    except ValueError:
        try:  # if there's an error, try float
            result = float(s)
        except ValueError:
            # if the conversion gave an error, the string is not a number
            pass
    return result
Results
a.mk_value(1234) 27ms 27526μs
a.mk_value(1234.56) 42ms 42097μs
a.mk_value('1234') 502ms 502109μs
a.mk_value('1234.56') 520ms 520395μs
a.mk_value('1,234.56', thousand_sep=',') 570ms 570749μs
a.mk_value('1.234.567,89 EUR', currency... 627ms 627456μs
b.mk_value(1234) 27ms 27082μs
b.mk_value(1234.56) 40ms 40014μs
b.mk_value('1234') 94ms 94444μs
b.mk_value('1234.56') 276ms 276519μs
b.mk_value('1,234.56', thousand_sep=',') 315ms 315310μs
b.mk_value('1.234.567,89 EUR', currency... 374ms 374861μs
c.mk_value(1234) 11ms 11482μs
c.mk_value(1234.56) 22ms 22765μs
c.mk_value('1234') 69ms 69251μs
c.mk_value('1234.56') 176ms 176908μs
c.mk_value('1,234.56', thousand_sep=',') 226ms 226709μs
c.mk_value('1.234.567,89 EUR', currency... 285ms 285431μs
d.mk_value(1234) 11ms 11483μs
d.mk_value(1234.56) 22ms 22355μs
d.mk_value('1234') 69ms 69151μs
d.mk_value('1234.56') 169ms 169364μs
d.mk_value('1,234.56', thousand_sep=',') 187ms 187460μs
d.mk_value('1.234.567,89 EUR', currency... 233ms 233935μs
I would write it with early-out logic and raising an exception to indicate failure:
import types
NumberTypes = (types.IntType, types.LongType, types.FloatType, types.ComplexType)
def mk_value(s, currency_sign='', thousand_sep='', decimal_sep='.'):
    """Convert *s* to a number, raising when that is not possible.

    Values that are already instances of ``NumberTypes`` are returned
    unchanged.  For ``str`` input, ``currency_sign`` and ``thousand_sep`` are
    removed and ``decimal_sep`` is replaced by a dot.  The value is then
    passed to ``int`` and, if that raises ``ValueError``, to ``float``; an
    exception from ``float`` propagates to the caller.
    """
    # Early out: already in the desired form.
    if isinstance(s, NumberTypes):
        return s

    if isinstance(s, str):
        # Prepare the string for conversion.  No strip() and no special case
        # for the empty string: int() and float() deal with both.
        for unwanted in (currency_sign, thousand_sep):
            if unwanted:
                s = s.replace(unwanted, '')
        if decimal_sep != '.':
            s = s.replace(decimal_sep, '.')

    try:
        return int(s)
    except ValueError:
        # Not an integer; let float() decide and let its error propagate.
        return float(s)

How to ensure all string literals are unicode in python

I have a fairly large python code base to go through. It's got an issue where some string literals are strings and others are unicode. And this causes bugs. I am trying to convert everything to unicode. I was wondering if there is a tool that can convert all literals to unicode. I.e. if it found something like this:
print "result code %d" % result['code']
to:
print u"result code %d" % result[u'code']
If it helps, I use PyCharm (in case there is an extension that does this); however, I would be happy to use a command-line tool as well. Hopefully such a tool exists.
You can use tokenize.generate_tokens to break the string representation of Python code into tokens. tokenize also classifies the tokens for you. Thus you can identify string literals in Python code.
It is then not hard to manipulate the tokens, adding 'u' where desired:
import tokenize
import token
import io
import collections
class Token(collections.namedtuple('Token', 'num val start end line')):
    """One tokenizer result: ``(num, val, start, end, line)``.

    ``num`` is the numeric token type; ``name`` gives its symbolic name.
    """

    # `name` has to be a property: it is read as an attribute
    # (``tok.name == 'STRING'``), which would never be true for a plain method.
    @property
    def name(self):
        """Symbolic name of the token type, looked up in ``token.tok_name``."""
        return token.tok_name[self.num]
def change_str_to_unicode(text):
    """Return *text* (Python source) with a ``u`` prefix on string literals.

    The source is tokenized, so text inside comments is left alone.  A
    literal is left unchanged when its prefix already contains ``u``/``U``
    or ``b``/``B``; adding ``u`` to either would not produce a valid
    literal.

    The result is the source lines joined with ``'\\n'``; a trailing newline
    of the input is not kept.
    """
    result = text.splitlines()
    # Insert a dummy line into result so indexing result
    # matches tokenize's 1-based indexing
    result.insert(0, '')

    # generate_tokens needs a readline callable; use the in-memory stream
    # that matches the type of the source (bytes or text).
    if isinstance(text, bytes):
        readline = io.BytesIO(text).readline
    else:
        readline = io.StringIO(text).readline

    changes = []
    for tok in tokenize.generate_tokens(readline):
        kind, value, start = tok[0], tok[1], tok[2]
        if kind != token.STRING:
            continue
        # The literal's prefix is everything in front of its first quote.
        first_quote = min(
            pos for pos in (value.find("'"), value.find('"')) if pos != -1
        )
        prefix = value[:first_quote].lower()
        if 'u' in prefix or 'b' in prefix:
            continue
        changes.append(start)

    # Apply the insertions from the end backwards so the column offsets of
    # literals earlier on the same line stay valid.
    for linenum, col in reversed(changes):
        line = result[linenum]
        result[linenum] = line[:col] + 'u' + line[col:]
    return '\n'.join(result[1:])
text = '''print "result code %d" % result['code']
# doesn't touch 'strings' in comments
'handles multilines' + \
'okay'
u'Unicode is not touched'
'''
print(change_str_to_unicode(text))
yields
print u"result code %d" % result[u'code']
# doesn't touch 'strings' in comments
u'handles multilines' + u'okay'
u'Unicode is not touched'
Try this (uses regex); it's shorter than @unutbu's solution.
But there's a loophole: strings containing # won't work with this.
import re
scode = '''
print "'Hello World'" # prints 'Hello World'
u'Unicode is unchanged'"""
# so are "comments"'''
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')
def repl(m):
return "u%(c)s%(data)s%(c)s" % m.groupdict()
fcode = '\n'.join(
[re.sub(x1,repl,i)
if not '#' in i
else re.sub(x1,repl,i[:i.find('#')])+i[i.find('#'):]
for i in scode.splitlines()])
print fcode
Outputs:
print u"'Hello World'" # prints 'Hello World'
u'Unicode is unchanged'
# so are "comments"
To handle #, I have this (and it's longer than @unutbu's solution :| )
import re
scode = '''print "'Hello World'" # prints 'Hello World'
u'Unicode is unchanged'
# so are "comments"
'#### Hi' # 'Hi' '''
# Matches a quoted string: optional 'u', an opening quote, the shortest
# possible body, then the same quote character again.
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')


def in_string(text, index):
    """Return True if the character at *index* of *text* is inside a string.

    The text is scanned from its start.  A quote opens a string when none is
    open, and only the same quote character closes it again; the other quote
    character inside an open string is ordinary content.  An opening quote
    counts as inside, a closing quote as outside.
    """
    open_quote = ''
    for c in text[:index + 1]:
        if c == '"' or c == "'":
            if not open_quote:
                open_quote = c
            elif c == open_quote:
                open_quote = ''
    return bool(open_quote)


def repl(m):
    """Rebuild the matched string literal with a single 'u' prefix."""
    return "u%(c)s%(data)s%(c)s" % m.groupdict()


def handle_hashes(i):
    """Prefix the string literals of line *i*, leaving its comment untouched.

    Everything from the first '#' that is outside a string literal to the end
    of the line is treated as the comment and copied unchanged.
    """
    # Always locate the comment through get_hash_out_of_string: even a line
    # with a single '#' may have that '#' inside a string.
    n = get_hash_out_of_string(i)
    return re.sub(x1, repl, i[:n]) + i[n:]


def get_hash_out_of_string(i):
    """Return the index of the first '#' in *i* that is outside a string.

    If there is no such '#', ``len(i)`` is returned, so that ``i[:n]`` is the
    whole line and ``i[n:]`` is empty.
    """
    n = i.find('#')
    while n != -1:
        if not in_string(i, n):
            return n
        n = i.find('#', n + 1)
    return len(i)
fcode = '\n'.join(
[re.sub(x1,repl,i)
if not '#' in i
else handle_hashes(i)
for i in scode.splitlines()])
print fcode
Output:
print u"'Hello World'" # prints 'Hello World'
u'Unicode is unchanged'
# so are "comments"
u'#### Hi' # 'Hi'

regex to replace regex

I have this regex for getting strings in Python code:
x1 = re.compile('''((?P<unicode>u?)(?P<c1>'|")(?P<data>.+?)(?P<c2>'|"))''')
I want to extract the data and c1,c2 parts of this regex to make a replace string (if c1 == c2)
Something like:
repl = "u<c1><data><c2>"
How can I do this??
Is that possible in one line or by using re.sub?
UPDATE:
My new code:
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')
def repl(match):
if '#' in match.string:
### Confused
return "u%(c)s%(data)s%(c)s" % m.groupdict()
fcode = '\n'.join([re.sub(x1,repl,i) for i in scode.splitlines()])
Here, I am having problems to determine how to not change strings in comments, what do I have to do to ignore the comments??
Say you have a pattern:
pattern = r'''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''' # did a little tweak
Match a string:
m = re.search(pattern, "print('hello')")
What you got:
>>> m.groups()
('', '"', 'hello')
>>> m.groupdict()
{'c': '"', 'unicode': '', 'data': 'hello'}
Now you can do whatever you want with these:
>>> 'u{c}{data}{c}'.format_map(m.groupdict())
'u"hello"'
Maybe you are using Python 2.x:
>>> 'u{c}{data}{c}'.format(**m.groupdict())
'u"hello"'
Or even you like old %
>>> "u%(c)s%(data)s%(c)s" % m.groupdict()
'u"hello"'
Edited:
The regex solution can't handle some situations correctly.
So I used a 2to3 hack(it's actually 3to2, and still can't solve everything):
cd /usr/lib/python3.3/lib2to3/fixes/
cp fix_unicode.py fix_unicode33.py
Edit fix_unicode33.py
-_literal_re = re.compile(r"[uU][rR]?[\'\"]")
+_literal_re = re.compile(r"[rR]?[\'\"]")
-class FixUnicode(fixer_base.BaseFix):
+class FixUnicode33(fixer_base.BaseFix):
- new.value = new.value[1:]
+ new.value = 'u' + new.value
Now 2to3 --list | grep unicode33 should output unicode33
Then you can run 2to3 -f unicode33 py3files.py.
Remember to remove fix_unicode33.py after
NOTE: In Python3 ur"string" throws SyntaxError. The logic here is simple, modify it to reach your goal.
The long code I ended up with.
# Matches a quoted string: optional 'u', an opening quote, the shortest
# possible body, then the same quote character again.
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')


def in_string(text, index):
    """Return True if the character at *index* of *text* is inside a string.

    The text is scanned from its start.  A quote opens a string when none is
    open, and only the same quote character closes it again; the other quote
    character inside an open string is ordinary content.  An opening quote
    counts as inside, a closing quote as outside.
    """
    open_quote = ''
    for c in text[:index + 1]:
        if c == '"' or c == "'":
            if not open_quote:
                open_quote = c
            elif c == open_quote:
                open_quote = ''
    return bool(open_quote)


def repl(m):
    """Rebuild the matched string literal with a single 'u' prefix."""
    return "u%(c)s%(data)s%(c)s" % m.groupdict()


def handle_hashes(i):
    """Prefix the string literals of line *i*, leaving its comment untouched.

    Everything from the first '#' that is outside a string literal to the end
    of the line is treated as the comment and copied unchanged.
    """
    # Always locate the comment through get_hash_out_of_string: even a line
    # with a single '#' may have that '#' inside a string.
    n = get_hash_out_of_string(i)
    return re.sub(x1, repl, i[:n]) + i[n:]


def get_hash_out_of_string(i):
    """Return the index of the first '#' in *i* that is outside a string.

    If there is no such '#', ``len(i)`` is returned, so that ``i[:n]`` is the
    whole line and ``i[n:]`` is empty.
    """
    n = i.find('#')
    while n != -1:
        if not in_string(i, n):
            return n
        n = i.find('#', n + 1)
    return len(i)

Categories