I want to be able to pair up all parentheses in a string; if they aren't paired, then they get their index number and False. It seems like it is repeating some values over and over, i.e. cl == pop[1]. I have tried to see where the problem is, but I can't see it no matter how hard I try. So I'm asking if anyone can help me locate the error and maybe even improve my code ;)
def check_parentheses(string):
    '''Pair up the parentheses in *string* and report the unpaired ones.

    Uses three module-level names that are defined outside this function:
    ``lparens`` (opening characters), ``rparens`` (closing characters) and
    ``lrmap`` (maps an opening character to its closing character).

    Returns a list of two-item lists: ``[open_index, close_index]`` for
    every correctly matched pair, and ``[index, False]`` for every
    parenthesis that has no partner or was closed by the wrong type.
    '''
    brackets = []
    parenstack = collections.deque()
    # enumerate() supplies the true position of every character.  Looking
    # the position up with string.index() only for characters found in
    # lrmap left closing parentheses with the index of the previous
    # opening one, because lrmap is keyed by opening characters.
    for cl, ch in enumerate(string):
        if ch in lparens:
            parenstack.append([ch, cl])
            print(parenstack)
        elif ch in rparens:
            if not parenstack:
                print('no opening parenthesis left in stack')
                brackets.append([cl, False])
                continue
            pop = parenstack.pop()
            if lrmap[pop[0]] != ch:
                print('wrong type of parenthesis popped from stack %s %s %s %s'
                      % (pop[0], ch, pop[1], cl))
                brackets.append([pop[1], False])
                brackets.append([cl, False])
            else:
                brackets.append([pop[1], cl])
    # if we are not out of opening parentheses, we have a mismatch
    for p in parenstack:
        brackets.append([p[1], False])
    return brackets
You can adapt my code to a similar question:
def Evaluate(str):
    """Return True when *str* is made up only of correctly nested brackets.

    Recognised pairs are ``<>``, ``()``, ``{}`` and ``[]``.  Any other
    character makes the result False, as does a closer that does not
    match the most recent unclosed opener.
    """
    openers, closers = "<({[", ">)}]"
    pending = []
    for symbol in str:
        if symbol in openers:
            pending.append(symbol)
            continue
        if symbol not in closers:
            return False
        if not pending:
            return False
        if pending.pop() != openers[closers.index(symbol)]:
            return False
    return not pending
# Consecutive characters of the literal form (opener, closer) pairs; the
# single iterator is consumed two items at a time to build the mapping.
iparens = iter('(){}[]<>')
parens = {opener: next(iparens) for opener in iparens}
closing = parens.values()


def balanced(astr):
    """Return True when every bracket in *astr* is closed in the right order.

    Characters that are not brackets are ignored.
    """
    expected = []
    for symbol in astr:
        closer = parens.get(symbol, None)
        if closer:
            # Remember which closer has to come next for this opener.
            expected.append(closer)
            continue
        if symbol in closing and (not expected or symbol != expected.pop()):
            return False
    return not expected
Example:
>>> balanced('[1<2>(3)]')
True
>>> balanced('[1<2(>3)]')
False
# Opening brace -> the closing brace that has to match it.
BRACES = {'(': ')', '[': ']', '{': '}'}


def group_check(s):
    """Return True when *s* is a correctly nested sequence of braces.

    Every character that is not an opening brace is treated as a closer:
    it must equal the closer expected for the most recent opener,
    otherwise the result is False.
    """
    awaiting = []
    for symbol in s:
        closer = BRACES.get(symbol)
        if closer:
            awaiting.append(closer)
            continue
        if not awaiting:
            return False
        if awaiting.pop() != symbol:
            return False
    return not awaiting
Thanks hughdbrown your code was a breeze to get working and it's really short! You've just saved me a headache :D
converted it to pep8 if thats ok :)
Edit
Added support for comments and strings, it will not match inside them.
Added support for easy language brace checking, modify the charset dict.
Correctly pairs up, i.e. right to left
HTML
# HTML configuration: brace characters, string quote characters and
# comment delimiters as (start, end) pairs.
charset = dict(opening='{[(<',
               closing='}])>',
               string=('"', "'"),
               # The trailing comma keeps this a tuple containing ONE
               # (start, end) pair; without it the value collapses to the
               # flat tuple ('<!--', '-->') and each delimiter would be
               # iterated as if it were a pair.
               comment=(('<!--', '-->'),))
Python
# Python configuration: triple-quoted blocks and '#' line comments are
# listed as comment (start, end) pairs.
charset = {
    'opening': '{[(<',
    'closing': '}])>',
    'string': ('"', "'"),
    'comment': (("'''", "'''"), ('"""', '"""'), ('#', '\n')),
}
C++
# C++ configuration: block comments and '//' line comments are listed as
# comment (start, end) pairs.
charset = {
    'opening': '{[(<',
    'closing': '}])>',
    'string': ('"', "'"),
    'comment': (('/*', '*/'), ('//', '\n')),
}
you get the point? :)
charset = {
    'opening': '{[(<',
    'closing': '}])>',
    'string': ('"', "'"),
    'comment': (('<!--', '-->'), ('"""', '"""'), ('#', '\n')),
}
# First character of every comment start and end delimiter, followed by
# the quote characters and then all opening and closing brace characters.
allowed = ''.join(
    [start[0] + end[0] for start, end in charset['comment']]
    + list(charset['string'])
    + [charset['opening'], charset['closing']]
)
def brace_check(text):
"""Pair the opening and closing brace characters found in *text*.

Reads the module-level ``charset`` dict (keys 'opening', 'closing',
'string' and 'comment').  Returns ``found``, a list of 2-tuples whose
first item is a position in *text* and whose second item is either the
position it was paired with or False when no pairing was made.
"""
# o / c collect (character, index) tuples for opening / closing braces.
o = []
c = []
# notr is never read or written again in this function.
notr = []
found = []
# busy is False while scanning ordinary text; otherwise it holds the text
# being waited for (a quote character or a comment end delimiter).
busy = False
last_pos = None
# xrange exists only in Python 2.
for i in xrange(len(text)):
ch = text[i]
if not busy:
cont = True
for comment in charset['comment']:
if ch == comment[0][0]:
# NOTE(review): the slice end is len(comment[0]), not i + len(comment[0]).
# For any i > 0 the slice is shorter than comment[0], so the comparison
# below can only succeed at i == 0 -- probably meant
# text[i:i + len(comment[0])]; confirm before changing.
como = text[i:len(comment[0])]
if como == comment[0]:
busy = comment[1]
if ch in charset['opening']:
last_pos = i
cont = False
break
if cont:
if ch in charset['string']:
busy = ch
elif ch in charset['opening']:
o.append((ch, i))
elif ch in charset['closing']:
c.append((ch, i))
else:
if ch == busy[0]:
if len(busy) == 1:
comc = ch
else:
comc = text[i:i + len(busy)]
if comc == busy:
if last_pos is not None:
if busy[-1] in charset['closing']:
found.append((last_pos, i))
last_pos = None
# Overwrites the matched terminator in text with newline characters.
text = text[:i] + '\n' * len(comc) +\
text[i + len(comc):]
# busy is truthy here, so this resets it to False.
busy = not busy
elif busy in charset['string']:
if ch == '\n':
busy = not busy
# Walk the recorded openers from last to first and give each one the first
# remaining recorded closer of the matching type located after it.
for t, e in reversed(o):
try:
n = next((b, v) for b, v in c\
if b == charset['closing'][\
charset['opening'].find(t)] and v > e)
c.remove(n)
n = n[1]
if found != []:
# NOTE(review): only the most recent entry of found is compared with the
# candidate pair; presumably this rejects pairs that straddle it -- confirm.
if e < found[-1][0] and n > found[-1][0] and n < found[-1][1]\
or e < found[-1][1] and n > found[-1][1] and e > found[-1][0]:
found.append((n, False))
n = False
except StopIteration:
# No remaining closer of the right type after this opener.
n = False
found.append((e, n))
# Closers still left in c were never paired with an opener.
for t, e in c:
found.append((e, False))
return found
An understandable solution in Python 3:
def check_balanced_string(str):
    """Return True when the brackets in *str* are balanced.

    Handles ``()``, ``[]`` and ``{}``; every other character is ignored.
    A closing bracket with no opener before it yields False.
    """
    stack = []
    dicc = {'(': ')', '[': ']', '{': '}'}
    closers = set(dicc.values())
    for char in str:
        if char in dicc:  # opening char
            stack.append(char)
        elif char in closers:  # closing char
            # The stack must be checked for emptiness first: indexing
            # stack[-1] on an empty stack raises IndexError for input
            # such as ")".
            if not stack or dicc[stack.pop()] != char:
                return False
    return not stack  # True only when nothing is left open


eq = '{1+[3*5+(2+1)]}'
print(check_balanced_string(eq))
Try this:
def matched(s):
    """Return True when every ")" in *s* closes an earlier "(" and no "(" stays open.

    Only round parentheses are considered; other characters are ignored.
    """
    opener, closer = "(", ")"
    unclosed = 0  # number of "(" seen that still wait for their ")"
    for symbol in s:
        if symbol in opener:
            unclosed += 1
        if symbol in closer:
            if unclosed == 0:
                return False
            unclosed -= 1
    return unclosed == 0
The code below displays the missing parentheses and the number of times each one is missing in the given string.
from collections import Counter
def find_missing(str):
    """Count the brackets that would have to be added to balance *str* by type.

    Only the number of each bracket character is compared, not its
    position.  Both tallies are printed, then a dict is returned that
    maps each missing bracket character to how many of it are missing.
    """
    open_set = '<[{('
    closed_set = '>]})'
    opened = Counter()
    closed = Counter()
    for symbol in str:
        if symbol in open_set:
            opened[symbol] += 1
        elif symbol in closed_set:
            closed[symbol] += 1
    print(opened)
    print(closed)

    res_dict = {}
    # More openers than closers of a kind: closers are missing.
    for opener, closer in zip(open_set, closed_set):
        if opened[opener] > closed[closer]:
            res_dict[closer] = opened[opener] - closed[closer]
    # More closers than openers of a kind: openers are missing.
    for opener, closer in zip(open_set, closed_set):
        if closed[closer] > opened[opener]:
            res_dict[opener] = closed[closer] - opened[opener]
    return res_dict
if __name__ == '__main__':
    # Demo run: report whether the sample text has missing brackets.
    str1 = '{This ((()bracket {[function]} <<going> crazy}'
    x = find_missing(str1)
    if not x:
        print("Balanced")
    else:
        print("Imbalanced")
        print(x)
First we will scan the string from left to right, and every time we see an opening parenthesis we push it to a stack, because we want the last opening parenthesis to be closed first. (Remember the FILO structure of a stack!)
Then, when we see a closing parenthesis we check whether the last opened one is the corresponding closing match, by popping an element from the stack. If it’s a valid match, then we proceed forward, if not return false.
Code:
https://gist.github.com/i143code/51962bfb1bd5925f75007d4dcbcf7f55
I needed something for a recent project and figured I could build on the OP's solution a bit. It allows for comment patterns, quotes and brackets to be checked, whilst ignoring the surrounding text. I've purposefully made it more generic than it needs to be so that others can take what they want and cut out what they don't.
"""
This module is for testing bracket pairings within a given string
Tested with Python 3.5.4
>>> regexp = getRegexFromList(opening + closing)
>>> print(regexp)
(\\<\\-\\-|\\-\\-\\>|\\/\\*|\\/\\/|\\*\\/|\\#|\\"|\\'|\\(|\\[|\\{|\\<|\\\n|\\\n|\\"|\\'|\\)|\\]|\\}|\\>)
>>> test_string = 'l<--([0])-->1/*{<2>}*/3//<--4 &-->\\n5#"6"\\n7"/*(8)*/"9\'"10"\'11({12\ta})13[<14>]'
>>> patterns = re.findall(regexp, test_string)
>>> print(patterns)
['<--', '(', '[', ']', ')', '-->', '/*', '{', '<', '>', '}', '*/', '//', '<--', '-->', '\\n', '#', '"', '"', '\\n', '"', '/*', '(', ')', '*/', '"', '(', '{', '}', ')', '[', '<', '>', ']']
>>> doBracketsMatch(patterns)
True
>>> doBracketsMatch(['"', ')', '"', '[', ']', '\\''])
False
"""
# Dependencies
import re
# Global Variables
# Provide opening and closing patterns, along with their priorities & whether a priority is nestable
opening = ['<--', '/*', '//', '#', '"', '\'', '(', '[', '{', '<']
closing = ['-->', '*/', '\n', '\n', '"', '\'', ')', ']', '}', '>']
priority = [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
nestable = {0: True, 1: False}
# Every pattern (opening or closing) maps to [its partner, its priority].
# A pattern that occurs more than once keeps the entry of its last
# occurrence.
bracket_pairs = {
    pattern: [partner, rank]
    for pattern, partner, rank in zip(opening + closing,
                                      closing + opening,
                                      priority + priority)
}
def getRegexFromList(listOfPatterns):
    """
    Generate the search term for the regular expression

    Each pattern is matched literally.  Non-alphanumeric characters are
    prefixed with a backslash; letters and digits are left untouched,
    because a backslash in front of them would turn them into regex
    escapes (tab, whitespace class, back-reference) or an invalid escape.

    :param listOfPatterns: iterable of literal patterns to search for
    :return: a single capturing group of alternatives, longest first
    >>> getRegexFromList(['"', '<--', '##', 'test'])
    '(test|\\\\<\\\\-\\\\-|\\\\#\\\\#|\\\\")'
    """
    # Longer patterns first to prevent false negatives
    search_terms = sorted(listOfPatterns, key=len, reverse=True)
    escaped_terms = []
    for term in search_terms:
        escaped_terms.append(''.join(
            char if char.isalnum() else '\\' + char for char in str(term)))
    # Search pattern = (a|b|c)
    return '(' + '|'.join(escaped_terms) + ')'
def doBracketsMatch(list_of_brackets):
    """
    Determine if brackets match up

    Uses the module-level tables ``opening``, ``closing``,
    ``bracket_pairs`` and ``nestable``.

    :param list_of_brackets: sequence of bracket/comment/quote patterns in
        the order they occur
    :return: True when every opened pattern is closed, otherwise False
    """
    stack = []
    for bracket in list_of_brackets:
        # Check empty stack conditions.  Emptiness is tested by truth
        # value; comparing the length to 0 with "is" tests object
        # identity rather than equality.
        if not stack:
            # Check for openings first to catch quotes
            if bracket in opening:
                stack.append(bracket)
            elif bracket in closing:
                return False
            else:
                continue
        # Check for a matching bracket
        elif bracket == bracket_pairs[stack[-1]][0]:
            stack.pop()
        # Ignore cases:
        #  - False positives
        #  - Lower priority brackets
        #  - Equal priority brackets if nesting is not allowed
        elif (bracket not in bracket_pairs or
              bracket_pairs[bracket][1] < bracket_pairs[stack[-1]][1] or
              (bracket_pairs[bracket][1] == bracket_pairs[stack[-1]][1] and
               not nestable[bracket_pairs[bracket][1]])):
            continue
        # New open bracket
        elif bracket in opening:
            stack.append(bracket)
        # Otherwise, unpaired close bracket
        else:
            return False
    # If stack isn't empty, then there is an unpaired open bracket
    return not stack
# When executed as a script, run the doctests embedded in this module's
# docstrings.
if __name__ == '__main__':
import doctest
doctest.testmod()
Related
I am trying to write a program which checks for balanced brackets in an equation. My program checks the brackets, but it only gives the right answer for a string made up purely of brackets; it gives a different answer for an equation.
My expected output is
exp1 = "(2+3)+(1-5)" # True
exp2 = "((3*2))*(7/3))" # False
exp3 = "(3*5))]" # False
My program below:
def is_valid(myStr):
    """ Check the orders of the brackets

    Characters other than ()[]{} are ignored, so whole equations can be
    checked.

    Returns True or False
    """
    opening = ['(', '[', '{']
    closing = [')', ']', '}']
    stack = []
    for i in myStr:
        if i in opening:
            stack.append(i)
        elif i in closing:
            pos = closing.index(i)
            if stack and opening[pos] == stack[-1]:
                stack.pop()
            else:
                return False
    # The emptiness check has to run once, after the whole string has been
    # scanned; performing it inside the loop returns a verdict on the
    # first character.
    return not stack
My program returns False for all of the above equations. What am I doing wrong?
Found few bugs and improvements.
PS: It's better not to use i,j as variable but some meaningful names such as ele, element etc.
def is_valid(myStr):
    """ Check the orders of the brackets

    An unmatched closing bracket is kept on the stack, so the string is
    valid only when nothing at all is left over after the scan.

    Returns True or False
    """
    openers = ('(', '[', '{')
    closers = (')', ']', '}')
    unresolved = []
    for symbol in myStr:
        if symbol in openers:
            unresolved.append(symbol)
        elif symbol in closers:
            wanted = openers[closers.index(symbol)]
            if unresolved and unresolved[-1] == wanted:
                unresolved.pop()
            else:
                unresolved.append(symbol)
    return len(unresolved) == 0
print(is_valid('(2+3)+(1-5)'))
print(is_valid('((3*2))*(7/3))'))
print(is_valid('(3*5))]'))
# True
# False
# False
The last if-else statements in your program checking if the stack length is 0 should be outside the for loop. I have changed the code for your reference and checked it with your examples. It worked fine.
def is_valid(myStr):
    """Return True when the brackets in *myStr* are balanced and ordered.

    Characters other than ()[]{} are ignored.  The stack is printed after
    every opening bracket is pushed.
    """
    closer_to_opener = {')': '(', ']': '[', '}': '{'}
    openers = ('(', '[', '{')
    stack = []
    for symbol in myStr:
        if symbol in openers:
            stack.append(symbol)
            print(stack)
            continue
        if symbol not in closer_to_opener:
            continue
        if not stack or stack[-1] != closer_to_opener[symbol]:
            return False
        stack.pop()
    return not stack
in Javascript;
// Maps every opening bracket to the closing bracket that matches it.
const brackets = new Map([
  ['{', '}'],
  ['(', ')'],
  ['[', ']'],
]);

/**
 * Scan `str` and cancel out correctly nested bracket pairs.
 *
 * Every character that does not close the most recent unclosed opener is
 * kept, so non-bracket characters count as leftovers too.
 *
 * @param {string} str text to scan
 * @returns {boolean} true when anything is left over, false when every
 *   character was cancelled out
 */
function baz(str) {
  const leftover = [];
  for (const ch of str.split('')) {
    const top = leftover[leftover.length - 1];
    if (!brackets.has(ch) && brackets.get(top) === ch) {
      leftover.pop();
      continue;
    }
    leftover.push(ch);
  }
  return leftover.length !== 0;
}
baz('({[([]())]})')
baz('{[]}')
baz('{(])}')
baz('{([)]}')
The leetcode question is :
Given a string s containing just the characters '(', ')', '{', '}', '[' and ']', determine if the input string is valid.
An input string is valid if:
Open brackets must be closed by the same type of brackets.
Open brackets must be closed in the correct order.
My code:
class Solution:
    def isValid(self, s: str) -> bool:
        """Return True when the brackets in *s* are closed in the right order.

        An empty string yields False.  Every closing bracket must match
        the opener on top of the stack.
        """
        mapper = {')': '(',
                  ']': '[',
                  '}': '{'}
        stack = []
        if not s:
            return False
        for char in s:
            if char in mapper:
                # Compare against the live top of the stack.  A separately
                # cached "top element" goes stale after a pop, which made
                # nested input such as '{[]}' fail.
                if not stack or stack.pop() != mapper[char]:
                    return False
            else:
                stack.append(char)
        return not stack
The logic works for '()' input but not for '{[]}'. I think the error is in the if-else condition
What am I doing wrong?
There are several things:
class Solution:
def isValid(self, s: str) -> bool:
^^^^
Does this function need to be non-static?
mapper = {')':'(',
']':'[',
'}':'{'}
stack = []
top_element = -1
if not s:
return False
for char in s:
if char in mapper and top_element == -1:
return False
if char in mapper and mapper[char] == top_element:
stack.pop()
# Change top element
top_element = stack[-1] if stack else -1
You need to change top_element after pop.
What if parenthesis is closing, but it doesn't match with top_element?
elif char in mapper and mapper[char] != top_element:
return False
else:
stack.append(char)
top_element = stack[-1]
return not stack
Also you could check if s contains other symbols than parenthesis, just in case.
The logic of your code is good; you just need to update top_element after the pop. (Change this:
if char in mapper and mapper[char] == top_element:
stack.pop()
else:
stack.append(char)
top_element = stack[-1] if len(stack)>0 else None
To this (with one more line the commented one):
if char in mapper and mapper[char] == top_element:
stack.pop()
top_element = stack[-1] #additional line
else:
stack.append(char)
top_element = stack[-1]
You're in the right path.
We can just simply add a sentinel value to the stack at initialization.
This'd pass through:
class Solution:
    def isValid(self, base_string):
        """Return True when every bracket in *base_string* is properly closed.

        The stack starts with a sentinel, so popping for a closing bracket
        never hits an empty list; the sentinel equals no opener and
        therefore produces False.
        """
        sentinel = 0
        opener_for = {')': '(', '}': '{', ']': '['}
        pending = [sentinel]
        for symbol in base_string:
            if symbol not in opener_for:
                pending.append(symbol)
                continue
            if pending.pop() != opener_for[symbol]:
                return False
        return pending == [sentinel]
I have the following python script which does regex matching using 'AND', 'OR' features as well:
class PyBoolReException(Exception):
    """Error raised for an invalid boolean regular expression string."""

    def __init__(self, value):
        # The payload is kept on the instance and rendered by __str__.
        self.value = value

    def __str__(self):
        text = str(self.value)
        return text
class PyBoolRe:
""" Compile a boolean search string into one regular expression.

The string may use ' | ', ' & ', a leading '!' and parentheses.  It is
split into a flat list of sub-expressions by __parse, turned into a
regexp string by __makerexp, and compiled; match() and search()
delegate to the compiled pattern. """
def __init__(self, boolstr):
# Require whitespace before words?
self.__needspace = True
# whitespace re
# NOTE(review): non-raw pattern string; it relies on Python leaving the
# unknown string escape '\s' untouched.
self._wspre = re.compile('^\s*$')
# create regexp string
self.__rexplist = []
# Only the number of '(' and ')' is compared, not their order.
oparct = boolstr.count('(')
clparct = boolstr.count(')')
if oparct != clparct:
# NOTE(review): "raise Class, value" is Python 2-only syntax.
raise PyBoolReException, 'Mismatched parantheses!'
self.__parse(boolstr)
# if NOT is one of the members, reverse
# the list
# print self.__rexplist
if '!' in self.__rexplist:
self.__rexplist.reverse()
s = self.__makerexp(self.__rexplist)
# print s
self.__rexp = re.compile(s)
def match(self, data):
""" Match the boolean expression, behaviour
is same as the 'match' method of re """
return self.__rexp.match(data)
def search(self, data):
""" Search the boolean expression, behaviour
is same as the 'search' method of re """
return self.__rexp.search(data)
def __parse(self, s):
""" Parse the boolean regular expression string
and create the regexp list """
# The string is a nested parantheses with
# any character in between the parens.
scopy = s[:]
oparmatch, clparmatch = False, False
# Look for a NOT expression
# rfind picks the LAST '(' in the string; the next ')' after it closes
# that group.
index = scopy.rfind('(')
# l is not referenced again in this method.
l = []
if index != -1:
oparmatch = True
index2 = scopy.find(')', index)
if index2 != -1:
clparmatch = True
newstr = scopy[index+1:index2]
# if the string is only of whitespace chars, skip it
if not self._wspre.match(newstr):
self.__rexplist.append(newstr)
# str.replace removes every occurrence of the parenthesised text, not
# only the one located above.
replacestr = '(' + newstr + ')'
scopy = scopy.replace(replacestr, '')
self.__parse(scopy)
# Nothing parenthesised: whatever text remains becomes one list entry.
if not clparmatch and not oparmatch:
if scopy: self.__rexplist.append(scopy)
def is_inbetween(self, l, elem):
""" Find out if an element is in between
in a list """
# list.index returns the position of the FIRST element equal to elem.
index = l.index(elem)
if index == 0:
return False
if index>2:
if index in range(1, len(l) -1):
return True
else:
return False
else:
return True
def __makenotexpr(self, s):
""" Make a NOT expression """
# A leading '!' is rewritten as a negative lookahead group.
if s.find('!') == 0:
return ''.join(('(?!', s[1:], ')'))
else:
return s
def __makerexp(self, rexplist):
""" Make the regular expression string for
the boolean match from the nested list """
is_list = True
if type(rexplist) is str:
is_list = False
elem = rexplist
elif type(rexplist) is list:
elem = rexplist[0]
if type(elem) is list:
elem = elem[0]
# eor is set when there is nothing after this element to recurse into.
eor = False
if not is_list or len(rexplist) == 1:
eor = True
word_str = '.*'
s=''
# Implementing NOT
if elem == '!':
return ''.join(('(?!', self.__makerexp(rexplist[1:]), ')'))
# Implementing OR
elif elem.find(' | ') != -1:
listofors = elem.split(' | ')
for o in listofors:
# index is assigned here but not used afterwards.
index = listofors.index(o)
in_bet = self.is_inbetween(listofors, o)
if o:
o = self.__makenotexpr(o)
if in_bet:
s = ''.join((s, '|', word_str, o, '.*'))
else:
s = ''.join((s, word_str, o, '.*'))
# Implementing AND
elif elem.find(' & ') != -1:
listofands = elem.split(' & ')
for a in listofands:
# index and in_bet are assigned here but not used afterwards.
index = listofands.index(a)
in_bet = self.is_inbetween(listofands, a)
if a:
a = self.__makenotexpr(a)
s = ''.join((s, word_str, a, '.*'))
else:
if elem:
elem = self.__makenotexpr(elem)
s = ''.join((elem, '.*'))
if eor:
return s
else:
# The remaining entries are appended by plain concatenation.
return ''.join((s, self.__makerexp(rexplist[1:])))
When the search phrase is as follows:
p = PyBoolRe('Python | Perl')
s1 = 'Guido invented Python'
s2 = 'Guido Perl'
if p.match(s1):
print 'Match found for first string'
else:
print 'No match found for first string'
if p.match(s2):
print 'Match found for second string'
else:
print 'No match found for second string'
Then both s1 & s2 match
But when the search phrase is:
p = PyBoolRe('Guido & (Python | Perl)')
s1 = 'Guido invented Python'
s2 = 'Guido Perl is great'
Then it should match if s1 or s2 has "Guido Python" or "Guido Perl". s2 has that but it does not match it. On the other hand, it matches s1, which it should not. Why is that?
Please help!! How can I get it to work??
Your generated expression is
.*Python.*|.*Perl.*.*Guido.*
while it should look like
(?=.*Guido.*)(?:.*Python.*|.*Perl.*)
So the parser needs some revision.
1) x|y should be enclosed into (?:...) (at least when used inside another block). Otherwise, | unluckily takes the global priority in the regexp.
2) x & y should be converted into (?=x)y (trailing context may be used to express the and between regular expressions)
I have this regex for getting strings in Python code:
x1 = re.compile('''((?P<unicode>u?)(?P<c1>'|")(?P<data>.+?)(?P<c2>'|"))''')
I want to extract the data and c1,c2 parts of this regex to make a replace string (if c1 == c2)
Something like:
repl = "u<c1><data><c2>"
How can I do this??
Is that possible in one line or by using re.sub?
UPDATE:
My new code:
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')
def repl(match):
# Replacement callback for re.sub: builds the literal again with a "u"
# prefix from the named groups c and data.
# NOTE(review): this draft is incomplete -- the "if" below has no body
# besides a comment, and the return line reads "m", which is not defined
# in this function (the parameter is called "match").
if '#' in match.string:
### Confused
return "u%(c)s%(data)s%(c)s" % m.groupdict()
fcode = '\n'.join([re.sub(x1,repl,i) for i in scode.splitlines()])
Here, I am having problems to determine how to not change strings in comments, what do I have to do to ignore the comments??
Say you have a pattern:
pattern = r'''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''' # did a little tweak
Match a string:
m = re.search(pattern, "print('hello')")
What you got:
>>> m.groups()
('', '"', 'hello')
>>> m.groupdict()
{'c': '"', 'unicode': '', 'data': 'hello'}
Now you can do whatever you want with these:
>>> 'u{c}{data}{c}'.format_map(m.groupdict())
'u"hello"'
Maybe you are using Python 2.x:
>>> 'u{c}{data}{c}'.format(**m.groupdict())
'u"hello"'
Or even you like old %
>>> "u%(c)s%(data)s%(c)s" % m.groupdict()
'u"hello"'
Edited:
The regex solution can't handle some situations correctly.
So I used a 2to3 hack(it's actually 3to2, and still can't solve everything):
cd /usr/lib/python3.3/lib2to3/fixes/
cp fix_unicode.py fix_unicode33.py
Edit fix_unicode33.py
-_literal_re = re.compile(r"[uU][rR]?[\'\"]")
+_literal_re = re.compile(r"[rR]?[\'\"]")
-class FixUnicode(fixer_base.BaseFix):
+class FixUnicode33(fixer_base.BaseFix):
- new.value = new.value[1:]
+ new.value = 'u' + new.value
Now 2to3 --list | grep unicode33 should output unicode33
Then you can run 2to3 -f unicode33 py3files.py.
Remember to remove fix_unicode33.py after
NOTE: In Python3 ur"string" throws SyntaxError. The logic here is simple, modify it to reach your goal.
The long code I ended up with.
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')
def in_string(text,index):
    """Tell whether the character at ``text[index]`` lies inside a quoted string.

    A string starts at a single or double quote and ends at the next
    occurrence of the same quote character.  An opening quote counts as
    inside, the closing quote as outside.  Backslash escapes are not
    interpreted.

    Raises IndexError when *index* is not a valid position in *text*.
    """
    if not 0 <= index < len(text):
        raise IndexError('index out of range')
    # Quote character of the string currently open, '' when outside one.
    # A single state variable is used so that the value tested and the
    # value updated can never drift apart.
    curr = ''
    for c in text[:index + 1]:
        if c == '"' or c == "'":
            if not curr:
                curr = c
            elif curr == c:
                curr = ''
    return bool(curr)
def repl(m):
    """Rebuild the matched string literal with a ``u`` prefix.

    *m* is a match object that provides the named groups ``c`` (the quote
    character) and ``data`` (the text between the quotes).
    """
    named_groups = m.groupdict()
    return "u%(c)s%(data)s%(c)s" % named_groups
def handle_hashes(i):
    """Prefix the string literals in line *i* with ``u``, leaving the comment alone.

    The line is split at the ``#`` that starts the comment; only the part
    before it is rewritten with ``x1`` / ``repl``.
    """
    hashes = i.count('#')
    if hashes == 0:
        # No '#' at all: the whole line is code.  A "not found" index of
        # -1 would split off the last character of the line instead.
        n = len(i)
    elif hashes == 1:
        n = i.find('#')
    else:
        n = get_hash_out_of_string(i)
    return re.sub(x1,repl,i[:n]) + i[n:]
def get_hash_out_of_string(i):
    """Return the index of the first ``#`` in *i* that is outside a quoted string.

    A string starts at a single or double quote and ends at the next
    unescaped occurrence of the same quote character.  When no ``#`` is
    found outside a string, ``len(i)`` is returned, so that ``i[:n]`` is
    the whole line and ``i[n:]`` is empty.
    """
    quote = ''       # quote character of the open string, '' when outside
    escaped = False  # True right after a backslash inside a string
    # One left-to-right pass that tracks the quote state, instead of
    # re-scanning the prefix of the line for every candidate '#'.
    for n, c in enumerate(i):
        if quote:
            if escaped:
                escaped = False
            elif c == '\\':
                escaped = True
            elif c == quote:
                quote = ''
        elif c == '"' or c == "'":
            quote = c
        elif c == '#':
            return n
    return len(i)
I would like to split a string similar to
'abc "defg hijk \\"l; mn\\" opqrs"; tuv'
into
(['abc', '"defg hijk \\"l; mn\\" opqrs"'], 33)
i.e. I don't want to break on semicolon inside (nested) quotes. What's the easiest way, tokenize? It doesn't hurt if it's fast, but short is better.
Edit: I forgot one more detail that makes it even more tricky. I need the position of the semicolon that is cutting off the string, or -1 if there is none. (I'm doing changes to legacy code that used to be recursive, but stackoverflowed when the string became very long.)
It's unlikely there is an easy way to solve this without a proper parser. You could probably get away with a hand built parser that doesn't require tokenizing though.
Something like the following should be a good guide:
def parse(s):
    """Split *s* into space-separated tokens up to the first top-level ';'.

    A double-quoted section is kept inside its token together with the
    quotes; spaces and semicolons inside it do not split or stop the
    scan.  A backslash makes the following character literal, both
    inside and outside quotes.  Empty tokens are not emitted.

    Returns the list of tokens found before the terminating ';' (or
    before the end of *s* when there is none).
    """
    cur_s = []
    strings = []
    state = {'in_quotes': False, 'escaped': False}

    def flush_string():
        # Emit the token collected so far.  The list is emptied in place:
        # rebinding cur_s here would create a new local name instead of
        # resetting the buffer of the enclosing function.
        if cur_s:
            strings.append(''.join(cur_s))
            del cur_s[:]

    def handle_special_cases(c):
        # Quotes and backslashes are kept in the token and switch the
        # quoting / escaping state.
        cur_s.append(c)
        if c == '\\':
            state['escaped'] = True
        else:
            state['in_quotes'] = not state['in_quotes']

    for c in s:
        if state['escaped']:
            # The character after a backslash is always taken literally.
            cur_s.append(c)
            state['escaped'] = False
        elif c in ['\\', '"']:
            handle_special_cases(c)
        elif state['in_quotes']:
            cur_s.append(c)
        elif c == ';':
            break
        elif c == ' ':
            flush_string()
        else:
            cur_s.append(c)
    flush_string()
    return strings
It's a stateful search, so simple stateless operations are not available. Here's a simple char-by-char stateful evaluator that might meet your "short" without resorting to full tokenization/parsing:
#!/usr/bin/env python
inp="""abc "defg hijk \\"l; mn\\" opqrs"; tuv'`"""
def words_to_semi(inpstr):
    """Split *inpstr* into words up to the first ';' outside double quotes.

    Returns ``(words, pos)``: *words* starts as ``['']`` and gets a new
    empty entry for every space seen outside quotes; *pos* is the number
    of characters consumed before the scan stopped.
    """
    # States: 1 = regular text, 2 = inside quotes,
    #         3 = escaped char inside quotes, 4 = escaped char outside quotes
    words = ['']
    state = 1
    consumed = 0
    for ch in inpstr:
        if state == 1:
            if ch == ';':
                break  # terminating semicolon: not counted, not stored
            if ch == ' ':
                words.append('')
            else:
                words[-1] += ch
                if ch == '"':
                    state = 2
                elif ch == '\\':
                    state = 4
        elif state == 2:
            words[-1] += ch
            if ch == '\\':
                state = 3
            elif ch == '"':
                state = 1
        elif state == 3:
            words[-1] += ch
            state = 2
        else:
            words[-1] += ch
            state = 1
        consumed += 1
    return words, consumed
print str(words_to_semi(inp))
Just modify the ops dict (and add new states) to handle other cases; everything else is generic code.
Here's the brute-force method I went with. Brrr...
def f(s):
    """Tokenise *s* up to the first ';' that is outside double quotes.

    Tokens are separated by spaces, tabs and newlines.  A double-quoted
    section forms a token of its own, quotes included; a backslash inside
    it makes the next character literal.

    Returns ``(tokens, cut_index)``, where *cut_index* is the position of
    the terminating ';' or -1 when there is none.
    """
    instr = False
    inescape = False
    a = []  # characters of the token being collected
    rs = []
    cut_index = -1
    for idx, ch in enumerate(s):
        if instr:
            a.append(ch)
            if inescape:
                inescape = False
            elif ch == '\\':
                inescape = True
            elif ch == '"':
                rs.append(''.join(a))
                a = []
                instr = False
        elif ch == '"':
            if a:
                rs.append(''.join(a))
            a = [ch]
            instr = True
        elif ch == ';':
            cut_index = idx
            break
        elif ch == ' ' or ch == '\t' or ch == '\n':
            if a:
                rs.append(''.join(a))
            a = []
        else:
            a.append(ch)
    # Emit whatever is still being collected.  Without this the last token
    # is lost whenever the input ends without a terminating ';'.
    if a:
        rs.append(''.join(a))
    return rs, cut_index
f('abc "defg hijk \\"l; mn\\" opqrs"; tuv')