Python Regex matching wrong strings

Python Regex matching wrong strings - python

I have the following python script which does regex matching using 'AND', 'OR' features as well:
class PyBoolReException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return str(self.value)
class PyBoolRe:
def __init__(self, boolstr):
# Require whitespace before words?
self.__needspace = True
# whitespace re
self._wspre = re.compile('^\s*$')
# create regexp string
self.__rexplist = []
oparct = boolstr.count('(')
clparct = boolstr.count(')')
if oparct != clparct:
raise PyBoolReException, 'Mismatched parantheses!'
self.__parse(boolstr)
# if NOT is one of the members, reverse
# the list
# print self.__rexplist
if '!' in self.__rexplist:
self.__rexplist.reverse()
s = self.__makerexp(self.__rexplist)
# print s
self.__rexp = re.compile(s)
def match(self, data):
""" Match the boolean expression, behaviour
is same as the 'match' method of re """
return self.__rexp.match(data)
def search(self, data):
""" Search the boolean expression, behaviour
is same as the 'search' method of re """
return self.__rexp.search(data)
def __parse(self, s):
""" Parse the boolean regular expression string
and create the regexp list """
# The string is a nested parantheses with
# any character in between the parens.
scopy = s[:]
oparmatch, clparmatch = False, False
# Look for a NOT expression
index = scopy.rfind('(')
l = []
if index != -1:
oparmatch = True
index2 = scopy.find(')', index)
if index2 != -1:
clparmatch = True
newstr = scopy[index+1:index2]
# if the string is only of whitespace chars, skip it
if not self._wspre.match(newstr):
self.__rexplist.append(newstr)
replacestr = '(' + newstr + ')'
scopy = scopy.replace(replacestr, '')
self.__parse(scopy)
if not clparmatch and not oparmatch:
if scopy: self.__rexplist.append(scopy)
def is_inbetween(self, l, elem):
""" Find out if an element is in between
in a list """
index = l.index(elem)
if index == 0:
return False
if index>2:
if index in range(1, len(l) -1):
return True
else:
return False
else:
return True
def __makenotexpr(self, s):
""" Make a NOT expression """
if s.find('!') == 0:
return ''.join(('(?!', s[1:], ')'))
else:
return s
def __makerexp(self, rexplist):
""" Make the regular expression string for
the boolean match from the nested list """
is_list = True
if type(rexplist) is str:
is_list = False
elem = rexplist
elif type(rexplist) is list:
elem = rexplist[0]
if type(elem) is list:
elem = elem[0]
eor = False
if not is_list or len(rexplist) == 1:
eor = True
word_str = '.*'
s=''
# Implementing NOT
if elem == '!':
return ''.join(('(?!', self.__makerexp(rexplist[1:]), ')'))
# Implementing OR
elif elem.find(' | ') != -1:
listofors = elem.split(' | ')
for o in listofors:
index = listofors.index(o)
in_bet = self.is_inbetween(listofors, o)
if o:
o = self.__makenotexpr(o)
if in_bet:
s = ''.join((s, '|', word_str, o, '.*'))
else:
s = ''.join((s, word_str, o, '.*'))
# Implementing AND
elif elem.find(' & ') != -1:
listofands = elem.split(' & ')
for a in listofands:
index = listofands.index(a)
in_bet = self.is_inbetween(listofands, a)
if a:
a = self.__makenotexpr(a)
s = ''.join((s, word_str, a, '.*'))
else:
if elem:
elem = self.__makenotexpr(elem)
s = ''.join((elem, '.*'))
if eor:
return s
else:
return ''.join((s, self.__makerexp(rexplist[1:])))
When the search phrase is as follows:
p = PyBoolRe('Python | Perl')
s1 = 'Guido invented Python'
s2 = 'Guido Perl'
if p.match(s1):
print 'Match found for first string'
else:
print 'No match found for first string'
if p.match(s2):
print 'Match found for second string'
else:
print 'No match found for second string'
Then both s1 & s2 match
But when the search phrase is:
p = PyBoolRe('Guido & (Python | Perl)')
s1 = 'Guido invented Python'
s2 = 'Guido Perl is great'
Then it should match if s1 or s2 has "Guido Python" or "Guido Perl". s2 has that but it does not match it. On the other hand, it matches s1, which it should not. Why is that?
Please help!! How can I get it to work??

Your generated expression is
.*Python.*|.*Perl.*.*Guido.*
while it should look like
(?=.*Guido.*)(?:.*Python.*|.*Perl.*)
So the parser needs some revision.
1) x|y should be enclosed into (?:...) (at least when used inside another block). Otherwise, | unluckily takes the global priority in the regexp.
2) x & y should be converted into (?=x)y (trailing context may be used to express the and between regular expressions)

Related

What's wrong with recursive Regex function code in python

I wrote a regex code which compares two strings. It recognises a special character '?' that allows zero or more instances of previous character. It works fine until there are two or more occasions of '?' in the string. And I can't make out why.
def single_character_string(a, b) -> "return True if characters match":
"""check if two characters match"""
if len(a) == 0:
return True
elif len(b) == 0:
return False
else:
if a == '.':
return True
else:
if a == b:
return True
else:
return False
def meta_question_result(temp):
if len(temp) >= 2:
if temp[1] == '?':
k_1 = temp.replace(temp[0: 2], '') # no char
k_2 = temp.replace(temp[1], '') # char
return k_1, k_2
def check_pair_by_pair(template, check_string) -> "Strings are of Equal length! " \
"return True if lines are identical":
"""check if two strings match symbol by symbol. template may be less than string, the opposite
is False"""
if not template: # exit from recursion
return True
if not check_string: # exit from recursion
return False
if meta_question_result(template):
t_1, t_2 = meta_question_result(template)
if single_character_string(t_1[0], check_string[0]):
return check_pair_by_pair(t_1[1:], check_string[1:])
if single_character_string(t_2[0], check_string[0]):
return check_pair_by_pair(t_2[1:], check_string[1:])
else:
return False
elif single_character_string(template[0], check_string[0]):
return check_pair_by_pair(template[1:], check_string[1:])
else:
return False
reg, st = input().split("|")
print(check_pair_by_pair(reg, st))
reg = "co?lou?r"
st = "colour"
gives True as expected,
reg = "co?lou?r"
st = "clor"
gives True as expected,
but...
reg = "co?lou?r"
st = "color"
gives False. I expected True.

Found the bag.
Replace method replaces all instances of '?'. So the second '?' was replaced also and program didn't see it.
I should add an argument 'count' to replace method that is equal to 1.
k_1 = temp.replace(temp[0: 2], '', 1) # no char

Balanced expression with replacement

Given a string that contains only the following => ‘{‘, ‘}’, ‘(‘, ‘)’, ‘[’, ‘]’. At some places there is ‘X’ in place of any bracket. Determine whether by replacing all ‘X’s with appropriate bracket, is it possible to make a valid bracket sequence.
Examples:
Input : S = "{(X[X])}"
Output : Balanced
Input : S = "[{X}(X)]"
Output : Not balanced
I tried to work it out like this, and it works for examples above. But it doesn't work for all examples eg. (it should be balanced but it says it's not)
Input: S = "([X}])"
Output: Not balanced
I tried to work it out but i can't find a solution. Please help.
class Stack:
def __init__(self):
self.data = []
def insert(self, x):
self.data.append(x)
def empty(self):
return len(self.data) == 0
def remove(self):
if self.empty():
raise ValueError('Stack is empty.')
self.data.pop()
def top_element(self):
if self.empty():
raise ValueError('Stack is empty.')
return self.data[-1]
def is_matching(a, b):
if a == "(" and b == ")":
return True
elif a == "[" and b == "]":
return True
elif a == "{" and b == "}":
return True
elif a == "X":
return True
return False
def is_balanced(expression,elements=Stack(),ind=0):
if ind == len(expression):
return elements.empty()
pre_brackets = "([{"
post_brackets = ")]}"
char = expression[ind]
if char in pre_brackets:
elements.insert(char)
return is_balanced(expression,elements,ind+1)
elif char in post_brackets:
if elements.empty() :
return False
if not is_matching(elements.top_element(), char):
return False
elements.remove()
return is_balanced(expression,elements,ind+1)
elif char == "X":
temp = Stack()
temp.insert(char)
result = (is_balanced(expression,temp,ind+1))
if result:
return True
if elements.empty():
return False
elements.remove()
return is_balanced(expression,elements,ind+1)
expression = "([X}])"
if expression == "":
print("No brackets in expression!")
elif len(expression) % 2 != 0:
print("Not balanced")
elif is_balanced(expression):
print("Balanced")
else:
print("Not Balanced")

You can do it by recursively testing all possible replacements for an X:
def can_be_balanced(expr):
pairs = "{}[]()"
opening_brackets = pairs[::2]
closing_brackets = pairs[1::2]
closer = {o:c for o, c in zip(opening_brackets, closing_brackets)}
opener = {c:o for o, c in zip(opening_brackets, closing_brackets)}
stack = []
for item in expr:
if item in opening_brackets:
# we append opening brackets to the stack
stack.append(item)
elif item in closing_brackets:
if not stack or stack[-1] != opener[item]:
# the closing bracket doesn't match the top of the stack
return False
else:
# if it does, we remove the matching opening bracket
stack.pop()
elif item == 'X':
# X could be any of the opening brackets,
possible = list(opening_brackets)
if stack and stack[-1] in opening_brackets:
# or the closing bracket matching the top of the stack
possible.append(closer[stack[-1]])
for pos in possible:
# we replace this X, the first one remaining in expr
test_expr = expr.replace('X', pos, 1)
if can_be_balanced(test_expr):
# This is just in order to print the working solution we just found,
# you may remove these two lines
if not 'X' in test_expr:
print(test_expr)
return True
# None of the replacements for X gave a balanced expression
return False
else:
raise ValueError(f'Invalid item {item} in {expr}')
# The expression is balanced if and only if the stack ends up empty
return not stack
Testing on your sample data:
tests = [("[{X}(X)]", False),
("{(X[X])}", True),
("([X}])", True),
]
for test in tests:
print(test[0], ': should be', test[1])
print(can_be_balanced(test[0]))
print('-'*20)
correctly outputs (with the balanced expression in case it can be done):
[{X}(X)] : should be False
False
--------------------
{(X[X])} : should be True
{([[]])}
True
--------------------
([X}]) : should be True
([{}])
True
--------------------
Note that a major problem in your code is that you only test the end of the expression, starting at the position of the X. Beware also of the mutable default argument elements=Stack() that would leave you with the remnants of the previous call to the function instead of a fresh, empty Stack object.

regex to replace regex

I have this regex for getting strings in Python code:
x1 = re.compile('''((?P<unicode>u?)(?P<c1>'|")(?P<data>.+?)(?P<c2>'|"))''')
I want to extract the data and c1,c2 parts of this regex to make a replace string (if c1 == c2)
Something like:
repl = "u<c1><data><c2>"
How can I do this??
Is that possible in one line or by using re.sub?
UPDATE:
My new code:
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')
def repl(match):
if '#' in match.string:
### Confused
return "u%(c)s%(data)s%(c)s" % m.groupdict()
fcode = '\n'.join([re.sub(x1,repl,i) for i in scode.splitlines()])
Here, I am having problems to determine how to not change strings in comments, what do I have to do to ignore the comments??

Say you have a pattern:
pattern = r'''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''' # did a little tweak
Match a string:
m = re.search(pattern, "print('hello')")
What you got:
>>> m.groups()
('', '"', 'hello')
>>> m.groupdict()
{'c': '"', 'unicode': '', 'data': 'hello'}
Now you can do whatever you want with these:
>>> 'u{c}{data}{c}'.format_map(m.groupdict())
'u"hello"'
Maybe you are using Python 2.x:
>>> 'u{c}{data}{c}'.format(**m.groupdict())
'u"hello"'
Or even you like old %
>>> "u%(c)s%(data)s%(c)s" % m.groupdict()
'u"hello"'
Edited:
The regex solution can't handle some situations correctly.
So I used a 2to3 hack(it's actually 3to2, and still can't solve everything):
cd /usr/lib/python3.3/lib2to3/fixes/
cp fix_unicode.py fix_unicode33.py
Edit fix_unicode33.py
-_literal_re = re.compile(r"[uU][rR]?[\'\"]")
+_literal_re = re.compile(r"[rR]?[\'\"]")
-class FixUnicode(fixer_base.BaseFix):
+class FixUnicode33(fixer_base.BaseFix):
- new.value = new.value[1:]
+ new.value = 'u' + new.value
Now 2to3 --list | grep unicode33 should output unicode33
Then you can run 2to3 -f unicode33 py3files.py.
Remember to remove fix_unicode33.py after
NOTE: In Python3 ur"string" throws SyntaxError. The logic here is simple, modify it to reach your goal.

The long code I ended up with.
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')
def in_string(text,index):
curr,in_l,in_str,level = '',0,False,[]
for c in text[:index+1]:
if c == '"' or c == "'":
if in_str and curr == c:
instr = False
curr = ''
in_l -= 1
else:
instr = True
curr = c
in_l += 1
level.append(in_l)
return bool(level[index])
def repl(m):
return "u%(c)s%(data)s%(c)s" % m.groupdict()
def handle_hashes(i):
if i.count('#') == 1:
n = i.find('#')
else:
n = get_hash_out_of_string(i)
return re.sub(x1,repl,i[:n]) + i[n:]
def get_hash_out_of_string(i):
n = i.find('#')
curr = i[:]
last = (len(i)-1)-''.join(list(reversed(i))).find('#')
while in_string(curr,n) and n < last:
curr = curr[:n]+' '+curr[n+1:]
n = curr.find('#')
return n

Splitting C string in Python

I would like to split a string similar to
'abc "defg hijk \\"l; mn\\" opqrs"; tuv'
into
(['abc', '"defg hijk \\"l; mn\\" opqrs"'], 33)
i.e. I don't want to break on semicolon inside (nested) quotes. What's the easiest way, tokenize? It doesn't hurt if it's fast, but short is better.
Edit: I forgot one more detail that makes it even more tricky. I need the position of the semicolon that is cutting off the string, or -1 if there is none. (I'm doing changes to legacy code that used to be recursive, but stackoverflowed when the string became very long.)

It's unlikely there is an easy way to solve this without a proper parser. You could probably get away with a hand built parser that doesn't require tokenizing though.
Something like the following should be a good guide:
def parse(s):
cur_s = []
strings = []
def flush_string():
strings.push(''.join(cur_s))
cur_s = []
def handle_special_cases():
# TODO: Fill this in
for c in s:
if c == ';':
break
elif c in ['\\' '"']:
handle_special_cases()
elif c == ' ':
flush_string()
else:
cur_s.push(c)
flush_string()
return strings

It's a stateful search, so simple stateless operations are not available. Here's a simple char-by-char stateful evaluator that might meet your "short" without resorting to full tokenization/parsing:
#!/usr/bin/env python
inp="""abc "defg hijk \\"l; mn\\" opqrs"; tuv'`"""
def words_to_semi(inpstr):
ret = ['']
st8 = 1 # state: 1=reg, 2=in quotes, 3=escaped quote, 4=escaped reg, 0=end
ops = { 1 : {' ': lambda c: (None,1),
'"': lambda c: (c,2),
';': lambda c: ('',0),
'\\': lambda c: (c,4),
},
2 : {'\\': lambda c: (c,3),
'"': lambda c: (c,1),
},
3 : {None: lambda c: (c,2)},
4 : {None: lambda c: (c,1)},
}
pos = 0
for C in inpstr:
oc,st8 = ops[st8].get(C, ops[st8].get(None, lambda c:(c,st8)))(C)
if not st8: break
if oc is None:
ret.append('')
else:
ret[-1] += oc
pos = pos + 1
return ret, pos
print str(words_to_semi(inp))
Just modify the ops dict (and add new states) to handle other cases; everything else is generic code.

Here's the brute-force method I went with. Brrr...
def f(s):
instr = False
inescape = False
a = ''
rs = []
cut_index = -1
for idx,ch in enumerate(s):
if instr:
a += ch
if inescape:
inescape = False
elif ch == '\\':
inescape = True
elif ch == '"':
if a:
rs += [a]
a = ''
instr = False
elif ch == '"':
if a:
rs += [a]
a = ch
instr = True
elif ch == ';':
if a:
rs += [a]
cut_index = idx
break
elif ch == ' ' or ch == '\t' or ch == '\n':
if a:
rs += [a]
a = ''
else:
a += ch
return rs, cut_index
f('abc "defg hijk \\"l; mn\\" opqrs"; tuv')

Parentheses pairing ({}[]()<>) issue

I want to be able to pair up all parentheses in a string, if they aren't paired then then they get their index number and False. It seems like it is repeating some values over and over, i.e cl == pop[1]. I have tried to see where the problem is but I can't see it no matter how hard I try. So I'm asking if anyone help me to locate the error and maybe even improve my code ;)
def check_parentheses(string):
pending = 0
brackets = []
'''Checks if parens are paired, otherwise they are bad.'''
parenstack = collections.deque()
for ch in string:
if ch in lrmap:
try:
cl = string.index(ch, pending)
pending = cl + 1
except:
cl = False
if ch in lparens:
parenstack.append([ch, cl])
print parenstack
elif ch in rparens:
try:
pop = parenstack.pop()
if lrmap[pop[0]] != ch:
print 'wrong type of parenthesis popped from stack',\
pop[0], ch, pop[1], cl
brackets.append([pop[1], False])
brackets.append([cl, False])
else:
brackets.append([pop[1], cl])
except IndexError:
print 'no opening parenthesis left in stack'
brackets.append([cl, False])
# if we are not out of opening parentheses, we have a mismatch
for p in parenstack:
brackets.append([p[1],False])
return brackets

You can adapt my code to a similar question:
def Evaluate(str):
stack = []
pushChars, popChars = "<({[", ">)}]"
for c in str :
if c in pushChars :
stack.append(c)
elif c in popChars :
if not len(stack) :
return False
else :
stackTop = stack.pop()
balancingBracket = pushChars[popChars.index(c)]
if stackTop != balancingBracket :
return False
else :
return False
return not len(stack)

iparens = iter('(){}[]<>')
parens = dict(zip(iparens, iparens))
closing = parens.values()
def balanced(astr):
stack = []
for c in astr:
d = parens.get(c, None)
if d:
stack.append(d)
elif c in closing:
if not stack or c != stack.pop():
return False
return not stack
Example:
>>> balanced('[1<2>(3)]')
True
>>> balanced('[1<2(>3)]')
False

BRACES = { '(': ')', '[': ']', '{': '}' }
def group_check(s):
stack = []
for b in s:
c = BRACES.get(b)
if c:
stack.append(c)
elif not stack or stack.pop() != b:
return False
return not stack

Thanks hughdbrown your code was a breeze to get working and it's really short! You've just saved me a headache :D
converted it to pep8 if thats ok :)
Edit
Added support for comments and strings, it will not match inside them.
Added support for easy language brace checking, modify the charset dict.
Correctly paires up, i.e right to left
HTML
charset = dict(opening='{[(<',\
closing='}])>',\
string = ('"', "'"),\
comment=(('<!--', '-->')))
Python
charset = dict(opening='{[(<',\
closing='}])>',\
string = ('"', "'"),\
comment=(("'''", "'''"), ('"""', '"""'), ('#', '\n')))
C++
charset = dict(opening='{[(<',\
closing='}])>',\
string = ('"', "'"),\
comment=(('/*', '*/'), ('//', '\n')))
you get the point? :)
charset = dict(opening='{[(<',\
closing='}])>',\
string = ('"', "'"),\
comment=(('<!--', '-->'), ('"""', '"""'), ('#', '\n')))
allowed = ''.join([x[0][0] + x[1][0] for x in charset['comment']])
allowed += ''.join(charset['string'])
allowed += charset['opening']
allowed += charset['closing']
def brace_check(text):
o = []
c = []
notr = []
found = []
busy = False
last_pos = None
for i in xrange(len(text)):
ch = text[i]
if not busy:
cont = True
for comment in charset['comment']:
if ch == comment[0][0]:
como = text[i:len(comment[0])]
if como == comment[0]:
busy = comment[1]
if ch in charset['opening']:
last_pos = i
cont = False
break
if cont:
if ch in charset['string']:
busy = ch
elif ch in charset['opening']:
o.append((ch, i))
elif ch in charset['closing']:
c.append((ch, i))
else:
if ch == busy[0]:
if len(busy) == 1:
comc = ch
else:
comc = text[i:i + len(busy)]
if comc == busy:
if last_pos is not None:
if busy[-1] in charset['closing']:
found.append((last_pos, i))
last_pos = None
text = text[:i] + '\n' * len(comc) +\
text[i + len(comc):]
busy = not busy
elif busy in charset['string']:
if ch == '\n':
busy = not busy
for t, e in reversed(o):
try:
n = next((b, v) for b, v in c\
if b == charset['closing'][\
charset['opening'].find(t)] and v > e)
c.remove(n)
n = n[1]
if found != []:
if e < found[-1][0] and n > found[-1][0] and n < found[-1][1]\
or e < found[-1][1] and n > found[-1][1] and e > found[-1][0]:
found.append((n, False))
n = False
except StopIteration:
n = False
found.append((e, n))
for t, e in c:
found.append((e, False))
return found

An understandable solution in Python 3:
def check_balanced_string(str):
stack = []
dicc = {'(': ')', '[': ']', '{': '}'}
for char in str:
if char in dicc.keys(): # opening char
stack.append(char)
elif char in dicc.values(): # closing char
if dicc[stack[-1]] == char: # check if closing char corresponds to last opening char
stack.pop()
else:
return False
return not len(stack) # returns True when len == 0
eq = '{1+[3*5+(2+1)]}'
print(check_balanced_string(eq))

Try this:
def matched(s):
stack=[]
open,close="(",")"
for i in s:
if i in open:
stack.append(i)
if i in close:
if len(stack)==0:
return(False)
else:
stack.pop()
if len(stack):
return(False)
else:
return(True)

The below code will display the missing parentheses and the no of times missing in the given string.
from collections import Counter
def find_missing(str):
stack1 = []
stack2 = []
result = []
res_dict = {}
open_set = '<[{('
closed_set = '>]})'
a = list(str)
for i in a:
if i in open_set:
stack1.append(i)
elif i in closed_set:
stack2.append(i)
dict1 = Counter(stack1)
dict2 = Counter(stack2)
print(dict1)
print(dict2)
for i in open_set:
if dict1[i] > dict2[closed_set[open_set.index(i)]]:
res_dict[closed_set[open_set.index(i)]] = dict1[i] - dict2[closed_set[open_set.index(i)]]
result.append(closed_set[open_set.index(i)])
for i in closed_set:
if dict2[i] > dict1[open_set[closed_set.index(i)]]:
res_dict[open_set[closed_set.index(i)]] = dict2[i] - dict1[open_set[closed_set.index(i)]]
result.append(open_set[closed_set.index(i)])
return res_dict
# return result
if __name__ == '__main__':
str1 = '{This ((()bracket {[function]} <<going> crazy}'
x = find_missing(str1)
if len(x) > 0:
print("Imbalanced")
print(x)
else:
print("Balanced")

First we will scan the string from left to right, and every time we see an opening parenthesis we push it to a stack, because we want the last opening parenthesis to be closed first. (Remember the FILO structure of a stack!)
Then, when we see a closing parenthesis we check whether the last opened one is the corresponding closing match, by popping an element from the stack. If it’s a valid match, then we proceed forward, if not return false.
Code:
https://gist.github.com/i143code/51962bfb1bd5925f75007d4dcbcf7f55

I needed something for a recent project and figured I could build on the OP's solution a bit. It allows for comment patterns, quotes and brackets to be checked, whilst ignoring the surrounding text. I've purposefully made it more generic than it needs to be so that others can take what they want and cut out what they don't.
"""
This module is for testing bracket pairings within a given string
Tested with Python 3.5.4
>>> regexp = getRegexFromList(opening + closing)
>>> print(regexp)
(\\<\\-\\-|\\-\\-\\>|\\/\\*|\\/\\/|\\*\\/|\\#|\\"|\\'|\\(|\\[|\\{|\\<|\\\n|\\\n|\\"|\\'|\\)|\\]|\\}|\\>)
>>> test_string = 'l<--([0])-->1/*{<2>}*/3//<--4 &-->\\n5#"6"\\n7"/*(8)*/"9\'"10"\'11({12\ta})13[<14>]'
>>> patterns = re.findall(regexp, test_string)
>>> print(patterns)
['<--', '(', '[', ']', ')', '-->', '/*', '{', '<', '>', '}', '*/', '//', '<--', '-->', '\\n', '#', '"', '"', '\\n', '"', '/*', '(', ')', '*/', '"', '(', '{', '}', ')', '[', '<', '>', ']']
>>> doBracketsMatch(patterns)
True
>>> doBracketsMatch(['"', ')', '"', '[', ']', '\\''])
False
"""
# Dependencies
import re
# Global Variables
# Provide opening and closing patterns, along with their priorities & whether a priority is nestable
opening = ['<--', '/*', '//', '#', '"', '\'', '(', '[', '{', '<']
closing = ['-->', '*/', '\n', '\n', '"', '\'', ')', ']', '}', '>']
priority = [ 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
nestable = {0: True, 1: False}
bracket_pairs = dict(zip(opening + closing, \
[[(closing + opening)[i], (priority + priority)[i]] \
for i in range(0, opening.__len__() * 2)]))
def getRegexFromList(listOfPatterns):
"""
Generate the search term for the regular expression
:param listOfPatterns:
:return:
>>> getRegexFromList(['"', '<--', '##', 'test'])
'(\\\\t\\\\e\\\\s\\\\t|\\\\<\\\\-\\\\-|\\\\#\\\\#|\\\\")'
"""
# Longer patterns first to prevent false negatives
search_terms = sorted(listOfPatterns, key=len, reverse=True)
regex = ""
for term in search_terms:
for char in str(term):
regex = regex + '\\' + char # Search for all characters literally
regex = regex + '|' # Search pattern = (a|b|c)
return '(' + regex[:-1] + ')' # Remove excess '|' and add brackets
def doBracketsMatch(list_of_brackets):
"""
Determine if brackets match up
:param list_of_brackets:
:return:
"""
stack = []
for bracket in list_of_brackets:
# Check empty stack conditions
if stack.__len__() is 0:
# Check for openings first to catch quotes
if bracket in opening:
stack.append(bracket)
elif bracket in closing:
return False
else:
continue
# Check for a matching bracket
elif bracket == bracket_pairs[stack[-1]][0]:
stack.pop()
# Ignore cases:
# - False positives
# - Lower priority brackets
# - Equal priority brackets if nesting is not allowed
elif bracket not in bracket_pairs or \
bracket_pairs[bracket][1] < bracket_pairs[stack[-1]][1] or \
(bracket_pairs[bracket][1] == bracket_pairs[stack[-1]][1] and \
not nestable[bracket_pairs[bracket][1]]):
continue
# New open bracket
elif bracket in opening:
stack.append(bracket)
# Otherwise, unpaired close bracket
else:
return False
# If stack isn't empty, then there is an unpaired open bracket
return not bool(stack)
if __name__ == '__main__':
import doctest
doctest.testmod()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python Regex matching wrong strings - python

Related

What's wrong with recursive Regex function code in python

Balanced expression with replacement

regex to replace regex

Splitting C string in Python

Parentheses pairing ({}[]()<>) issue

Categories

Resources