I am given a string in the following format: "a{1;4:6}" and "a{1;2}b{2:4}" where the ; represents two different numbers, and a : represents a sequence of numbers. There can be any number of combinations of semicolons and colons within the brace.
I want to expand it such that these are the results of expanding the two examples above:
"a{1;4:6}" = "a1a4a5a6"
"a{1;2}b{2:4}" = "a1b2b3b4a2b2b3b4"
I've never had to deal with something like this before, since I am usually given strings in some sort of ready-made format which is easily parsable. In this case I have to parse the string manually.
My attempt is to split the string manually, over and over again, until you hit a case where there is either a colon or a semicolon, then start building the string from there. This is horribly inefficient, and I would appreciate any thoughts on this approach. Here is essentially what the code looks like (I omitted a lot of it, just to get the point across more quickly):
>>> s = "a{1;4:6}"
>>> splitted = s.split("}")
>>> splitted
['a{1;4:6', '']
>>> splitted2 = [s.split("{") for s in splitted]
>>> splitted2
[['a', '1;4:6'], ['']]
>>> splitted3 = [s.split(";") for s in splitted2[0]]
>>> splitted3
[['a'], ['1', '4:6']]
# ... etc, then build up the strings manually once the ranges are figured out.
The thinking behind splitting at the close brace at first is that it is guaranteed that a new identifier, with an associated range comes up after it. Where am I going wrong? My approach works for simple strings such as the first example, but it doesn't for the second example. Furthermore it is inefficient. I would be thankful for any input on this problem.
I tried pyparsing for that and IMHO it produced pretty readable code (I took pack_tokens from the previous answer).
from pyparsing import nums, Literal, Word, oneOf, Optional, OneOrMore, Group, delimitedList
from string import ascii_lowercase as letters
# transform a '123' to 123
number = Word(nums).setParseAction(lambda s, l, t: int(t[0]))
# parses 234:543 ranges
range_ = number + Literal(':').suppress() + number
# transforms the range x:y to a list [x, x+1, ..., y]
range_.setParseAction(lambda s, l, t: list(range(t[0], t[1]+1)))
# parse the comma delimited list of ranges or individual numbers
range_list = delimitedList(range_|number,",")
# and pack them in a tuple
range_list.setParseAction(lambda s, l, t: tuple(t))
# parses 'a{2,3,4:5}' group
group = Word(letters, max=1) + Literal('{').suppress() + range_list + Literal('}').suppress()
# transform the group parsed as ['a', [2, 4, 5]] to ['a2', 'a4' ...]
group.setParseAction(lambda s, l, t: tuple("%s%d" % (t[0],num) for num in t[1]))
# the full expression is just those group one after another
expression = OneOrMore(group)
def pack_tokens(s, l, tokens):
    """Parse action that nests each group's items in front of the groups after it.

    ``tokens`` is the sequence of per-group tuples, e.g.
    ``[('a1', 'a2'), ('b2', 'b3')]``.  Every item of a group is followed by
    the complete expansion of all groups to its right, so the example packs
    to ``'a1b2b3a2b2b3'``.

    ``s`` and ``l`` belong to the parse-action signature and are not used.

    Returns the packed string, or ``''`` when ``tokens`` is empty.
    """
    packed = ''
    # Work right to left so the expansion of the trailing groups is built
    # once per level instead of being recomputed for every item.
    for current in reversed(list(tokens)):
        packed = ''.join(token + packed for token in current)
    return packed
expression.setParseAction(pack_tokens)
parsed = expression.parseString('a{1,2,3}')[0]
print(parsed)
parsed = expression.parseString('a{1,3:7}b{1:5}')[0]
print(parsed)
import re
def expand(compressed):
    """Expand a compressed group string such as ``'a{1;2}b{2:4}'``.

    Inside the braces ``;`` separates individual numbers and ``x:y`` denotes
    the inclusive range from ``x`` to ``y``.  Each item of a group is prefixed
    with the group's letter and followed by the full expansion of the groups
    to its right, so ``'a{1;2}b{2:4}'`` becomes ``'a1b2b3b4a2b2b3b4'``.

    Returns ``''`` when ``compressed`` contains no ``letter{...}`` group.
    """
    # 'b{2:4}' -> 'b{2;3;4}' i.e. reduce the problem to just one syntax
    normalized = re.sub(
        r'(\d+):(\d+)',
        lambda m: ';'.join(map(str, range(int(m.group(1)), int(m.group(2)) + 1))),
        compressed,
    )

    # 'a{1;2}b{2;3;4}' -> [('a', '1;2'), ('b', '2;3;4')]
    groups = re.findall(r'([a-z])\{([\d;]+)\}', normalized)

    # [('a', '1;2'), ('b', '2;3;4')] -> [['a1', 'a2'], ['b2', 'b3', 'b4']]
    tokens = [
        [letter + number for number in numbers.split(';')]
        for letter, numbers in groups
    ]

    # [['a1', 'a2'], ['b2', 'b3', 'b4']] -> 'a1b2b3b4a2b2b3b4'
    # Built right to left so each suffix is computed exactly once.
    packed = ''
    for current in reversed(tokens):
        packed = ''.join(token + packed for token in current)
    return packed
strings = ['a{1;4:6}', 'a{1;2}b{2:4}', 'a{1;2}b{2:4}c{3;6}']
for string in strings:
print(string, '->', expand(string))
OUTPUT
a{1;4:6} -> a1a4a5a6
a{1;2}b{2:4} -> a1b2b3b4a2b2b3b4
a{1;2}b{2:4}c{3;6} -> a1b2c3c6b3c3c6b4c3c6a2b2c3c6b3c3c6b4c3c6
Just to demonstrate a technique for doing this using eval (as @ialcuaz asked in the comments). Again, I wouldn't recommend doing it this way; the other answers are more appropriate. This technique can be useful when the structure is more complex (i.e. recursive with brackets and so on) and you don't want a full-blown parser.
import re
import functools
class Group(object):
    """A chain of prefixed item lists that renders as its packed expansion.

    ``Group('a', (1, 2))`` holds ``[['a1', 'a2']]``.  Adding groups
    concatenates their item lists, and ``repr()`` of the result is the nested
    expansion: ``repr(Group('a', (1, 2)) + Group('b', (2, 3)))`` is
    ``'a1b2b3a2b2b3'``.
    """

    def __init__(self, prefix, items):
        # One inner list per group: every item rendered as prefix + item.
        self.groups = [[prefix + str(x) for x in items]]

    def __add__(self, other):
        """Return a new Group with this group's lists followed by ``other``'s.

        Neither operand is modified.
        """
        # Bypass __init__: the combined lists are already in their final form.
        combined = Group.__new__(Group)
        combined.groups = self.groups + other.groups
        return combined

    def __repr__(self):
        return self.pack_tokens(self.groups)

    # adapted for Python 2.7 from @cdlane's code
    def pack_tokens(self, tokens):
        """Join ``tokens`` so each item of the first list precedes the packed rest.

        ``tokens`` must be a non-empty list of lists of strings.
        """
        current = tokens[:1][0]
        rest = tokens[1:]
        if not rest:
            return ''.join(current)
        # Expand the remaining groups once and reuse it for every item.
        suffix = self.pack_tokens(rest)
        return ''.join(token + suffix for token in current)
def createGroup(str, *items):
    """Build a ``Group`` from the prefix ``str`` and the positional ``items``."""
    group = Group(str, items)
    return group
def expand(compressed):
    """Expand ``compressed`` (e.g. ``'a{1;2}b{2:4}'``) by rewriting it as Python and evaluating it.

    The string is turned into an expression such as ``a(1,2) + b(2,3,4)`` in
    which every prefix names a function created from ``createGroup``; the
    value of that expression is returned.

    NOTE: the rewritten text is passed to ``eval``. Only call this with
    trusted input.
    """
    # Replace a{...}b{...} with a{...} + b{...} as we will overload the '+' operator to help during the evaluation
    expr = re.sub(r'(\}\w+\{)', lambda m: '} + ' + m.group(1)[1:-1] + '{', compressed)

    # Expand : range to explicit list of items (from @cdlane's answer)
    expr = re.sub(r'(\d+):(\d+)', lambda m: ';'.join(map(str, range(int(m.group(1)), int(m.group(2)) + 1))), expr)

    # Convert a{x;y;..} to a(x,y, ...) so that it evaluates as a function
    expr = expr.replace('{', '(').replace('}', ')').replace(";", ",")

    # Extract the group prefixes ('a', 'b', ...)
    # (plain raw string: the former ur'' prefix is a syntax error on Python 3)
    groupPrefixes = re.findall(r'(\w+)\([\d,]+\)', expr)

    # Build a namespace mapping functions 'a', 'b', ... to createGroup() capturing the groupName prefix in the closure
    ns = {prefix: functools.partial(createGroup, prefix) for prefix in groupPrefixes}

    # Evaluate the expression using the namespace
    return eval(expr, ns)
tests = ['a{1;4:6}', 'a{1;2}b{2:4}', 'a{1;2}b{2:4}c{3;6}']
for test in tests:
print(test, '->', expand(test))
Produces:
('a{1;4:6}', '->', a1a4a5a6)
('a{1;2}b{2:4}', '->', a1b2b3b4a2b2b3b4)
('a{1;2}b{2:4}c{3;6}', '->', a1b2c3c6b3c3c6b4c3c6a2b2c3c6b3c3c6b4c3c6)
Related
I have the following code replacing every element with its short form in the lookup:
case = ["MY_FIRST_RODEO"]
lookup = {'MY': 'M', 'FIRST': 'FRST', 'RODEO' : 'RD', 'FIRST_RODEO': 'FRD', 'MY_FIRST': 'MF', 'MY_FIRST_RODEO': 'MFR'}
case_mod = []
for string in case:
words = string.split("_")
new_string = [lookup[word] for word in words]
case_mod.append("_".join(new_string))
print(case_mod)
This returns:
['M_FRST_RD']
However, I want it to additionally return all possibilities since in the lookup, I have short words for all MY_FIRST, FIRST_RODEO, and MY_FIRST_RODEO. So, I want the following returned:
['M_FRST_RD', 'MF_RD', 'M_FRD', 'MFR']
I was able to write code to break the original list into all possibilities as follows:
case = ["MY_FIRST_RODEO"]
result = []
for string in case:
words = string.split("_")
n = len(words)
for i in range(n):
result.append("_".join(words[:i + 1]))
for j in range(i + 1, n):
result.append("_".join(words[i:j + 1]))
result.extend(words)
result = list(dict.fromkeys(result))
print(result)
to return:
['MY', 'MY_FIRST', 'FIRST', 'RODEO', 'MY_FIRST_RODEO', 'FIRST_RODEO']
But somehow can't make the connection between the two solutions. Any help will be greatly appreciated.
One thing you could try is the following:
from itertools import combinations
string = "MY_FIRST_RODEO"
lookup = {'MY': 'M', 'FIRST': 'FRST', 'RODEO' : 'RD', 'FIRST_RODEO': 'FRD', 'MY_FIRST': 'MF', 'MY_FIRST_RODEO': 'MFR'}
underscores = [i for i, c in enumerate(string) if c == "_"]
length = len(string)
results = []
for r in range(len(underscores), -1, -1):
for parts in combinations(underscores, r):
limits = ((a + 1, b) for a, b in zip((-1,) + parts, parts + (length,)))
results.append("_".join(lookup[string[a:b]] for a, b in limits))
First record the indices of string with an underscore and then use them with combinations (from the standard library module itertools) to choose all the different partitions of string along the underscores. (I've left out the outer loop over case since that is not needed to show the proposed mechanic.)
Result here:
['M_FRST_RD', 'M_FRD', 'MF_RD', 'MFR']
My list:
l = ["volcano", "noway", "lease", "sequence", "erupt"]
Desired output:
'volcanowayleasequencerupt'
I have tried:
using itertools.groupby, but it seems like it doesn't work well when there are 2 repeated letters in a row (i.e. leasesequence -> sese stays):
>>> from itertools import groupby
>>> "".join([i[0] for i in groupby("".join(l))])
'volcanonowayleasesequencerupt'
As you can see, it only got rid of the last 'e', and this is not ideal because if a word has double characters they will be shrunk to 1, i.e. 'suddenly' becomes 'sudenly'.
I'm looking for the most Pythonic approach for this.
Thank you in advance.
EDIT
My list does not have any duplicated items in it.
Using a helper function that crops a word t by removing its longest prefix that's also a suffix of s:
def crop(s, t):
    """Return ``t`` without its longest prefix that is also a suffix of ``s``."""
    overlap_len = len(t)
    # Shrink the candidate prefix until s ends with it; the empty prefix
    # always matches, so the loop stops at 0 at the latest.
    while not s.endswith(t[:overlap_len]):
        overlap_len -= 1
    return t[overlap_len:]
And then crop each word with its preceding word:
>>> l = ["volcano", "noway", "lease", "sequence", "erupt"]
>>> ''.join(crop(s, t) for s, t in zip([''] + l, l))
'volcanowayleasequencerupt'
>>> l = ['split', 'it', 'lit']
>>> ''.join(crop(s, t) for s, t in zip([''] + l, l))
'splitlit'
A more readable version, in my opinion:
from functools import reduce
def max_overlap(s1, s2):
    """Return the length of the longest prefix of ``s2`` that ``s1`` ends with."""
    # Try the longest candidate first; length 0 (the empty prefix) always matches.
    for size in range(len(s2), -1, -1):
        if s1.endswith(s2[:size]):
            return size
def overlap(strs):
    """Concatenate ``strs``, dropping from each string the prefix already at the end of the result.

    The overlap is measured against everything accumulated so far, not just
    against the previous string.
    """
    merged = ''
    for word in strs:
        merged = merged + word[max_overlap(merged, word):]
    return merged
overlap(l)
#> 'volcanowayleasequencerupt'
However, it also considers "accumulated" characters from previous words that overlapped:
overlap(['split', 'it', 'lit'])
#> 'split'
Here's a brute-force deduplicator:
def dedup(a, b):
    """Return ``a`` with its longest non-empty suffix that is a prefix of ``b`` removed.

    ``a`` is returned unchanged when no such suffix exists.
    """
    size = len(b)
    while size > 0:
        if a[-size:] == b[:size]:
            return a[:-size]
        size -= 1
    return a
Then, simply zip through:
>>> from itertools import chain, islice
>>> xs = ["volcano", "noway", "lease", "sequence", "erupt"]
>>> xs = [dedup(*x) for x in zip(xs, chain(islice(xs, 1, None), [""]))]
>>> "".join(xs)
'volcanowayleasequencerupt'
Naturally, this works for any length of list xs.
I have a dictionary containing the following key-value pairs: d={'Alice':'x','Bob':'y','Chloe':'z'}
I want to replace the lower case variables(values) by the constants(keys) in any given string.
For example, if my string is:
A(x)B(y)C(x,z)
how do I replace the characters in order to get a resultant string of :
A(Alice)B(Bob)C(Alice,Chloe)
Should I use regular expressions?
re.sub() solution with replacement function:
import re
d = {'Alice':'x','Bob':'y','Chloe':'z'}
flipped = dict(zip(d.values(), d.keys()))
s = 'A(x)B(y)C(x,z)'
result = re.sub(r'\([^()]+\)', lambda m: '({})'.format(','.join(flipped.get(k,'')
for k in m.group().strip('()').split(','))), s)
print(result)
The output:
A(Alice)B(Bob)C(Alice,Chloe)
Extended version:
import re
def repl(m):
    """Replacement callback for ``re.sub``: rewrite one ``(...)`` argument list.

    Every comma-separated variable inside the matched parentheses is replaced
    by the name it stands for; variables without a name become ``''``.
    """
    d = {'Alice':'x','Bob':'y','Chloe':'z'}
    # Invert the mapping: variable -> name.
    flipped = {variable: name for name, variable in d.items()}
    inner = m.group().strip('()')
    names = [flipped.get(k, '') for k in inner.split(',')]
    return '({})'.format(','.join(names))
s = 'A(x)B(y)C(x,z)'
result = re.sub(r'\([^()]+\)', repl, s)
print(result)
Bonus approach for particular input case A(x)B(y)C(Alice,z):
...
s = 'A(x)B(y)C(Alice,z)'
result = re.sub(r'\([^()]+\)', lambda m: '({})'.format(','.join(flipped.get(k,'') or k
for k in m.group().strip('()').split(','))), s)
print(result)
I assume you want to replace the values in a string with the respective keys of the dictionary. If my assumption is correct you can try this without using regex.
First, swap the keys and values using a dictionary comprehension.
my_dict = {'Alice':'x','Bob':'y','Chloe':'z'}
my_dict = { y:x for x,y in my_dict.iteritems()}
Then using list_comprehension, you replace the values
str_ = 'A(x)B(y)C(x,z)'
output = ''.join([i if i not in my_dict.keys() else my_dict[i] for i in str_])
Hope this is what you need ;)
Code
import re
d={'Alice':'x','Bob':'y','Chloe':'z'}
keys = d.keys()
values = d.values()
s = "A(x)B(y)C(x,z)"
for i in range(0, len(d.keys())):
rx = r"" + re.escape(values[i])
s = re.sub(rx, keys[i], s)
print s
Output
A(Alice)B(Bob)C(Alice,Chloe)
Also you could use the replace method in python like this:
d={'x':'Alice','y':'Bob','z':'Chloe'}
str = "A(x)B(y)C(x,z)"
for key in d:
str = str.replace(key,d[key])
print (str)
But yeah, you should swap your dictionary keys and values like Kishore suggested.
This is the way that I would do it:
import re
def sub_args(text, tosub):
    """Substitute the arguments of parenthesised groups in ``text``.

    A ``(...)`` group is rewritten only when it consists entirely of keys of
    ``tosub`` separated by commas; each key is replaced by ``str()`` of its
    value.  Everything else, including text outside parentheses, is left
    untouched, and replaced text is never substituted a second time.

    Keys are matched literally and are assumed not to contain commas.
    Returns ``text`` unchanged when ``tosub`` is empty.
    """
    if not tosub:
        return text

    # Build a real alternation of escaped keys (longest first) instead of
    # dropping 'a|b|c' into a character class.
    key = '|'.join(re.escape(k) for k in sorted(tosub, key=len, reverse=True))
    pattern = re.compile(r'\((?:%s)(?:,(?:%s))*\)' % (key, key))

    def substitute(match):
        args = match.group()[1:-1].split(',')
        return '(%s)' % ','.join(str(tosub[a]) for a in args)

    # A callable replacement keeps backslashes in the values literal, and a
    # single pass means already substituted text is not matched again.
    return pattern.sub(substitute, text)
text = 'A(x)B(y)C(x,z)'
tosub = {
'x': 'Alice',
'y': 'Bob',
'z': 'Chloe'
}
print(sub_args(text, tosub))
Basically you just use the regex pattern to find all of the argument groups and substitute in the proper values--the nice thing about this approach is that you don't have to worry about subbing where you don't want to (for example, if you had a string like 'Fn(F,n)'). You can also have multi-character keys, like 'F(arg1,arg2)'.
I am given an expression using parentheses and +'s, such as (((a+b)+c)+(d+e)).
I need to find the parse tree of this, and then print the list form of this parse tree like:
[ [ [a, b], c ], [d, e] ]
I was thinking I'd use something like ast, then ast2list. However, due to my not fully understanding these, I am repeatedly getting syntax errors. This is what I have:
import ast
import parser
a = ast.parse("(((a+b)+c)+(d+e))", mode='eval')
b = parser.ast2list(a)
print(b)
Could anyone guide me in the right direction? Thanks.
Colleen's comment can be realized with something like:
str = "(((a+b)+c)+(d+e))"
replacements = [
('(','['),
(')',']'),
('+',','),
# If a,b,c,d,e are defined variables, you don't need the following 5 lines
('a',"'a'"),
('b',"'b'"),
('c',"'c'"),
('d',"'d'"),
('e',"'e'"),
]
for (f,s) in replacements:
str = str.replace(f,s)
obj = eval(str)
print(str) # [[['a','b'],'c'],['d','e']]
print(obj) # [[['a', 'b'], 'c'], ['d', 'e']]
# You can access the parsed elements as you would any iterable:
print(obj[0]) # [['a', 'b'], 'c']
print(obj[1]) # ['d', 'e']
print(obj[1][0]) # d
If you really want to do a parser, start by not writing any code, but by understanding how your grammar should work. Backus-Naur Format or BNF is the typical notation used to define your grammar. Infix notation is a common software engineering parsing topic, and the basic BNF structure for infix notation goes like:
letter ::= 'a'..'z'
operand ::= letter+
term ::= operand | '(' expr ')'
expr ::= term ( '+' term )*
The key is that term contains either your alphabetic operand or an entire subexpression wrapped in ()'s. That subexpression is just the same as the overall expression, so this recursive definition takes care of all the parenthesis nesting. The expression then is a term followed by zero or more terms, added on using your binary '+' operator. (You could expand term to handle subtraction and multiplication/division as well, but I'm not going to complicate this answer more than necessary.)
Pyparsing is a package that makes it easy to translate a BNF to a working parser using Python objects (Ply, spark, and yapps are other parsers, which follow the more traditional lex/yacc model of parser creation). Here is that BNF implemented directly using pyparsing:
from pyparsing import Suppress, Word, alphas, Forward, Group, ZeroOrMore
LPAR, RPAR, PLUS = map(Suppress, "()+")
operand = Word(alphas)
# forward declare our overall expression, necessary when defining a recursive grammar
expr = Forward()
# each term is either an alpha operand, or an expr in ()'s
term = operand | Group(LPAR + expr + RPAR)
# define expr as a term, with optional '+ term's
expr << term + ZeroOrMore(PLUS + term)
# try it out
s = "(((a+b)+c)+(d+e))"
print expr.parseString(s)
giving:
[[[['a', 'b'], 'c'], ['d', 'e']]]
Infix notation with recognition of precedence of operations is a pretty common parser, or part of a larger parser, so pyparsing includes a helper builtin call operatorPrecedence to take care of all the nesting/grouping/recursion, etc. Here is that same parser written using operatorPrecedence:
from pyparsing import operatorPrecedence, opAssoc, Word, alphas, Suppress
# define an infix notation with precedence of operations
# you only define one operation '+', so this is a simple case
operand = Word(alphas)
expr = operatorPrecedence(operand,
[
('+', 2, opAssoc.LEFT),
])
print expr.parseString(s)
giving the same results as before.
More detailed examples can be found online at the pyparsing wiki - the explicit implementation at fourFn.py and the operatorPrecedence implementation at simpleArith.py.
Look at the docs for the ast module here where the NodeVisitor class is described.
import ast
import sys
class MyNodeVisitor(ast.NodeVisitor):
    """Collect visited expressions as nested ``(operator, left, right)`` tuples.

    Binary operations become ``(symbol, left, right)`` tuples, names become
    their identifier string and literals become their value.  Results are
    appended to ``self.ast`` and read back through ``currentTree()``.
    """

    # Operator node class -> printable symbol.
    op_dict = {
        ast.Add : '+',
        ast.Sub : '-',
        ast.Mult : '*',
    }
    # Node class -> function(visitor, node) producing the node's value.
    type_dict = {
        ast.BinOp: lambda s, n: s.handleBinOp(n),
        ast.Name: lambda s, n: getattr(n, 'id'),
    }
    # Literals are parsed as ast.Constant from Python 3.8 on and as ast.Num
    # before that; register whichever class this interpreter produces.
    if sys.version_info >= (3, 8):
        type_dict[ast.Constant] = lambda s, n: getattr(n, 'value')
    else:
        type_dict[ast.Num] = lambda s, n: getattr(n, 'n')

    def __init__(self, *args, **kwargs):
        ast.NodeVisitor.__init__(self, *args, **kwargs)
        self.ast = []

    def handleBinOp(self, node):
        """Return ``(symbol, left, right)`` for a BinOp node, converting both operands."""
        return (self.op_dict[type(node.op)], self.handleNode(node.left),
                self.handleNode(node.right))

    def handleNode(self, node):
        """Convert an operand node to its value.

        Raises TypeError for node types missing from ``type_dict``.
        """
        handler = self.type_dict.get(type(node), None)
        if handler is None:
            raise TypeError('unsupported node type: %s' % type(node).__name__)
        return handler(self, node)

    def visit_BinOp(self, node):
        op = self.handleBinOp(node)
        self.ast.append(op)

    def visit_Name(self, node):
        self.ast.append(node.id)

    def visit_Num(self, node):
        # Reached on interpreters that still produce ast.Num nodes.
        self.ast.append(node.n)

    def visit_Constant(self, node):
        # Reached on interpreters that produce ast.Constant nodes.
        self.ast.append(node.value)

    def currentTree(self):
        """Return an iterator over the collected items, most recent first."""
        return reversed(self.ast)
a = ast.parse(sys.argv[1])
visitor = MyNodeVisitor()
visitor.visit(a)
print list(visitor.currentTree())
Looks like this:
$ ./ast_tree.py "5 + (1 + 2) * 3"
[('+', 5, ('*', ('+', 1, 2), 3))]
Enjoy.
This is a simple enough problem that you could just write a solution from scratch. This assumes that all variable names are one character long, or that the expression has been correctly converted into a list of tokens. I threw in checks to make sure all parentheses are matched; obviously you should swap out CustomError for whatever exception you want to throw or other action you want to take.
def expr_to_list(ex):
    """Convert a parenthesised ``+`` expression into nested lists.

    ``ex`` is iterated item by item, so it may be a string of one-character
    operands or a list of tokens.  ``'('`` opens a new nested list, ``')'``
    closes the current one, ``'+'`` and ``' '`` are skipped, and every other
    item is appended as an operand.

    Returns the top-level list.  Raises ``CustomError`` when parentheses are
    unbalanced.
    NOTE(review): ``CustomError`` is not defined in this snippet; it must be
    supplied by the surrounding code.
    """
    tree = []
    # stack[0] is always the root; stack[-1] is the list currently being filled.
    stack = [tree]
    for c in ex:
        if c == '(':
            new_node = []
            stack[-1].append(new_node)
            stack.append(new_node)
        elif c == '+' or c == ' ':
            continue
        elif c == ')':
            # Only the root left means there is no open '(' to close.  Checking
            # the depth avoids a deep structural list comparison per ')'.
            if len(stack) == 1:
                raise CustomError('Unmatched Parenthesis')
            stack.pop()
        else:
            stack[-1].append(c)
    if len(stack) != 1:
        raise CustomError('Unmatched Parenthesis')
    return tree
Tested:
>>> expr_to_list('a + (b + c + (x + (y + z) + (d + e)))')
['a', ['b', 'c', ['x', ['y', 'z'], ['d', 'e']]]]
And for multi-character variable names, using a regex for tokenization:
>>> tokens = re.findall('\(|\)|\+|[\w]+',
'(apple + orange + (banana + grapefruit))')
>>> tokens
['(', 'apple', '+', 'orange', '+', '(', 'banana', '+', 'grapefruit', ')', ')']
>>> expr_to_list(tokens)
[['apple', 'orange', ['banana', 'grapefruit']]]
I would make a translator too. Doing it via ast was a bit cumbersome to implement for this purpose.
[tw-172-25-24-198 ~]$ cat a1.py
import re
def multiple_replace(text, adict):
    """Replace every occurrence of each key of ``adict`` in ``text`` with its value.

    All replacements happen in a single pass, so replaced text is never
    scanned again.  Keys are matched literally; where several keys match at
    the same position the longest one wins.  Returns ``text`` unchanged when
    ``adict`` is empty.
    """
    if not adict:
        # An empty alternation would match the empty string everywhere.
        return text
    # Longest keys first so a key cannot be shadowed by one of its prefixes.
    keys = sorted(adict, key=len, reverse=True)
    rx = re.compile('|'.join(map(re.escape, keys)))
    def one_xlat(match):
        return adict[match.group(0)]
    return rx.sub(one_xlat, text)
# Closure based approach
def make_xlat(*args, **kwds):
    """Return a function that applies a fixed set of replacements to a string.

    The arguments are passed to ``dict()`` to build the key -> replacement
    mapping.  The returned ``xlat(text)`` replaces every literal occurrence
    of a key in one pass; where several keys match at the same position the
    longest one wins.  With an empty mapping ``xlat`` returns its argument
    unchanged.
    """
    adict = dict(*args, **kwds)
    # Longest keys first so a key cannot be shadowed by one of its prefixes.
    keys = sorted(adict, key=len, reverse=True)
    # No pattern for an empty mapping: an empty alternation would match the
    # empty string everywhere.
    rx = re.compile('|'.join(map(re.escape, keys))) if keys else None
    def one_xlat(match):
        return adict[match.group(0)]
    def xlat(text):
        if rx is None:
            return text
        return rx.sub(one_xlat, text)
    return xlat
if __name__ == "__main__":
text = "((a+b)+c+(d+(e+f)))"
adict = {
"+":",",
"(":"[",
")":"]",
}
translate = make_xlat(adict)
print translate(text)
Should give
[[a,b],c,[d,[e,f]]]
Note - I have been having this snippet in my collections. It is from Python Cookbook. It does multiple replacements on the string, with the replacement key and values in the dictionary in a single pass.
I am trying to parse complex logical expression like the one below;
x > 7 AND x < 8 OR x = 4
and get the parsed string as a binary tree. For the above expression the expected parsed expression should look like
[['x', '>', 7], 'AND', [['x', '<', 8], 'OR', ['x', '=', 4]]]
'OR' logical operator has higher precedence than 'AND' operator. Parenthesis can override the default precedence. To be more general, the parsed expression should look like;
<left_expr> <logical_operator> <right_expr>
Another example would be
input_string = x > 7 AND x < 8 AND x = 4
parsed_expr = [[['x', '>', 7], 'AND', ['x', '<', 8]], 'AND', ['x', '=', 4]]
So far I came up with this simple solution, which sadly cannot generate the parsed expression in binary tree fashion. operatorPrecedence doesn't seem to help me here when the same logical operator appears consecutively, as in the previous example.
import pyparsing as pp
complex_expr = pp.Forward()
operator = pp.Regex(">=|<=|!=|>|<|=").setName("operator")
logical = (pp.Keyword("AND") | pp.Keyword("OR")).setName("logical")
vars = pp.Word(pp.alphas, pp.alphanums + "_") | pp.Regex(r"[+-]?\d+(:?\.\d*)?(:?[eE][+-]?\d+)?")
condition = (vars + operator + vars)
clause = pp.Group(condition ^ (pp.Suppress("(") + complex_expr + pp.Suppress(")") ))
expr = pp.operatorPrecedence(clause,[
("OR", 2, pp.opAssoc.LEFT, ),
("AND", 2, pp.opAssoc.LEFT, ),])
complex_expr << expr
print complex_expr.parseString("x > 7 AND x < 8 AND x = 4")
Any suggestions or guidance is well appreciated.
BNF for the expression (without parenthesis) could be
<expr> -> <expr> | <expr> <logical> <expr>
<expr> -> <opnd> <relational> <opnd>
<opnd> -> <variable> | <numeric>
<relational> -> <'>'> | <'<'> | <'='> | <'>='> | <'<='> | <'!='>
NOTE: the operatorPrecedence method of pyparsing is deprecated in favor of
the method name infixNotation.
Try changing:
expr = pp.operatorPrecedence(clause,[
("OR", 2, pp.opAssoc.LEFT, ),
("AND", 2, pp.opAssoc.LEFT, ),])
to:
expr = pp.operatorPrecedence(condition,[
("OR", 2, pp.opAssoc.LEFT, ),
("AND", 2, pp.opAssoc.LEFT, ),])
The first argument to operatorPrecedence is the primitive operand to be used with the operators - there is no need to include your complexExpr in parentheses - operatorPrecedence will do that for you. Since your operand is actually another deeper comparison, you might consider changing:
condition = (expr + operator + expr)
to:
condition = pp.Group(expr + operator + expr)
so that the output of operatorPrecedence is easier to process. With these changes, parsing x > 7 AND x < 8 OR x = 4 gives:
[[['x', '>', '7'], 'AND', [['x', '<', '8'], 'OR', ['x', '=', '4']]]]
which recognizes OR's higher precedence and groups it first. (Are you sure you want this order of AND and OR precedence? I think the traditional ordering is the reverse, as shown in this wikipedia entry.)
I think you are also asking why pyparsing and operatorPrecedence does not return the results in nested binary pairs, that is, you expect parsing "A and B and C" would return:
[['A', 'and', 'B'], 'and', 'C']
but what you get is:
['A', 'and', 'B', 'and', 'C']
That is because operatorPrecedence parses repeated operations at the same precedence level using repetition, not recursion. See this question which is very similar to yours, and whose answer includes a parse action to convert your repetitive parse tree to the more traditional binary parse tree. You can also find a sample boolean expression parser implemented using operatorPrecedence on the pyparsing wiki page.
EDIT:
To clarify, this is what I recommend you reduce your parser to:
import pyparsing as pp

# Comparison operators; the two-character forms are listed first so that
# ">=" is not consumed as ">".
operator = pp.Regex(">=|<=|!=|>|<|=").setName("operator")
# Signed integer or decimal with an optional exponent.  The optional parts
# are non-capturing groups "(?:...)"; the former "(:?...)" was a capturing
# group that also accepted a stray ':'.
number = pp.Regex(r"[+-]?\d+(?:\.\d*)?(?:[eE][+-]?\d+)?")
identifier = pp.Word(pp.alphas, pp.alphanums + "_")
comparison_term = identifier | number
# Group each comparison so it appears as one nested list in the results.
condition = pp.Group(comparison_term + operator + comparison_term)

expr = pp.operatorPrecedence(condition,[
    ("AND", 2, pp.opAssoc.LEFT, ),
    ("OR", 2, pp.opAssoc.LEFT, ),
    ])

print(expr.parseString("x > 7 AND x < 8 OR x = 4"))
If support for NOT might also be something you want to add, then this would look like:
expr = pp.operatorPrecedence(condition,[
("NOT", 1, pp.opAssoc.RIGHT, ),
("AND", 2, pp.opAssoc.LEFT, ),
("OR", 2, pp.opAssoc.LEFT, ),
])
At some point, you may want to expand the definition of comparison_term with a more complete arithmetic expression, defined with its own operatorPrecedence definition. I would suggest doing it this way, rather than creating one monster opPrec definition, as you have already alluded to some of the performance downsides to opPrec. If you still get performance issues, look into ParserElement.enablePackrat.
Let me suggest this parsing approach, coming directly from Peter Norvig's class in design of computer programs at udacity (and tweaked for your needs).
from functools import update_wrapper
from string import split
import re
def grammar(description, whitespace=r'\s*'):
    """Convert a description to a grammar.

    Each non-blank line of ``description`` is a rule for a non-terminal
    symbol and looks like this:

        Symbol => A1 A2 ... | B1 B2 ... | C1 C2 ...

    The right-hand side is one or more alternatives separated by ' | '.
    Each alternative is a sequence of atoms separated by whitespace.  The
    separators ' => ' and ' | ' must be surrounded by spaces (tabs are
    converted to spaces first).

    Returns a dict mapping every symbol to a tuple of alternatives, each a
    list of atom strings.  The key ' ' holds ``whitespace``, the regular
    expression for the whitespace allowed between tokens; pass '' to allow
    none.

    Raises ValueError for a non-blank line without ' => '.
    """
    G = {' ': whitespace}
    description = description.replace('\t', ' ') # no tabs!
    # str methods instead of the Python 2-only functions of the string module.
    for line in description.split('\n'):
        if not line.strip():
            # Tolerate blank lines, e.g. around a triple-quoted description.
            continue
        lhs, rhs = line.split(' => ', 1)
        alternatives = rhs.split(' | ')
        # Strip the symbol so indented rule lines still get a clean key.
        G[lhs.strip()] = tuple(alternative.split() for alternative in alternatives)
    return G
def decorator(d):
    """Wrap the decorator ``d`` so that what it returns keeps the decorated function's metadata.

    The returned decorator itself carries the metadata of ``d``.
    """
    def _d(fn):
        wrapped = d(fn)
        update_wrapper(wrapped, fn)
        return wrapped
    return update_wrapper(_d, d)
def memo(f):
    """Decorator that caches the return value of ``f`` per positional-argument tuple.

    Calls whose arguments cannot be used as a dict key are not cached; they
    are passed straight through to ``f``.  The returned wrapper carries the
    metadata of ``f``.
    """
    cache = {}
    def _f(*args):
        try:
            return cache[args]
        except KeyError:
            cache[args] = result = f(*args)
            return result
        except TypeError:
            # some element of args can't be a dict key; call f with the
            # arguments unpacked, exactly as in the cached path
            return f(*args)
    # Copy f's name/docstring onto the wrapper (the decorator line that used
    # to do this had been reduced to a comment).
    return update_wrapper(_f, f)
def parse(start_symbol, text, grammar):
    """Example call: parse('Exp', '3*x + b', G).
    Returns a (tree, remainder) pair. If remainder is '', it parsed the whole
    string. Failure iff remainder is None. This is a deterministic PEG parser,
    so rule order (left-to-right) matters. Do 'E => T op E | T', putting the
    longest parse first; don't do 'E => T | T op E'
    Also, no left recursion allowed: don't do 'E => E op T'"""

    # Every terminal is matched as: allowed whitespace, then the token in group 1.
    tokenizer = grammar[' '] + '(%s)'

    def parse_sequence(sequence, text):
        """Parse the atoms of one alternative in order; Fail as soon as one fails."""
        result = []
        for atom in sequence:
            tree, text = parse_atom(atom, text)
            if text is None: return Fail
            result.append(tree)
        return result, text

    # Memoize on (atom, text) so backtracking does not re-parse the same
    # atom at the same position (the decorator had been reduced to a comment).
    @memo
    def parse_atom(atom, text):
        if atom in grammar:  # Non-Terminal: tuple of alternatives
            for alternative in grammar[atom]:
                tree, rem = parse_sequence(alternative, text)
                if rem is not None: return [atom]+tree, rem
            return Fail
        else:  # Terminal: match characters against start of text
            m = re.match(tokenizer % atom, text)
            return Fail if (not m) else (m.group(1), text[m.end():])

    # Body of parse:
    return parse_atom(start_symbol, text)
Fail = (None, None)
MyLang = grammar("""expression => block logicalop expression | block
block => variable operator number
variable => [a-z]+
operator => <=|>=|>|<|=
number => [-+]?[0-9]+
logicalop => AND|OR""", whitespace='\s*')
def parse_it(text):
    """Parse ``text`` with the ``MyLang`` grammar, starting at the ``expression`` rule."""
    start_symbol = 'expression'
    return parse(start_symbol, text, MyLang)
print parse_it("x > 7 AND x < 8 AND x = 4")
Outputs:
(['expression', ['block', ['variable', 'x'], ['operator', '>'], ['number', '7']], ['logicalop', 'AND'], ['expression', ['block', ['variable', 'x'], ['operator', '<'], ['number', '8']], ['logicalop', 'AND'], ['expression', ['block', ['variable', 'x'], ['operator', '='], ['number', '4']]]]], '')