I'm trying to parse a simple template language with pyparsing which looks kind of like this:
if <expr>: <statements>
elif <expr>:
if <expr>:
<statements>
else:
<statements>
<statements>
In which statements are either if-blocks with being valid Python code or any text otherwise. If a string starts with an "if" keyword, that means it's an if-block right away. In fact, lines of text can also contain interpolations of the type {variable}, but that's about it.
A full valid example of the this syntax:
if a == b: do some {interpolation} stuff
elif a.x > b.y:
if True:
some {more} interpolation
and some more text
else: foo {bar}
arbitrary {text} here
lorem {ipsum}
To detect Python expressions, I've subclassed pyparsing.Token and it sort of seems to work:
import ast
from pyparsing import *
class PythonExpression(Token):
name = 'PythonExpression'
def parseImpl(self, s, loc, doActions=True):
max_loc = s.find('\n') if '\n' in s else len(s)
best_loc = None
for n in range(loc + 1, max_loc + 1):
try:
tree = ast.parse(s[loc:n])
except:
continue
if isinstance(tree, ast.Module):
if len(tree.body) is 1:
if isinstance(tree.body[0], ast.Expr):
best_loc = n
if best_loc is not None:
return best_loc, s[loc:best_loc]
raise ParseException(s, loc, 'invalid Python expression')
expr = 'if foo[1 : "bar:baz"] == 1 : passqwe'
print (Keyword('if') + PythonExpression()).parseString(expr).asList()
results in
['if', 'foo[1 : "bar:baz"] == 1 ']
However, I'm a bit lost with the use of pyparsing.indentedBlock, cannot seem to make it parse the entire grammar no matter what. My last attempt is this (note that it only contains the if statement implementation; there are also optional elif and else blocks):
ParserElement.setDefaultWhitespaceChars(' \t')
colon = Literal(':').suppress()
if_clause = PythonExpression() + colon
if_statement = Group(Keyword('if') + if_clause)
non_white = Regex(r'\S+')
anything = Combine(non_white + restOfLine)
statement = Forward()
indent_stack = [1]
if_block = Group(if_statement + ((anything) | indentedBlock(statement, indent_stack)))
other = ~Keyword('if') + anything
statement << (if_block | other)
parser = OneOrMore(indentedBlock(statement, indent_stack, False))
data = """\
if foo[1:2]:
bar
baz
if foo[3]:
bar : baz
if foo[4]: bar
baz
foo
"""
pprint.pprint(parser.parseString(data).asList())
Which starts parsing correctly but then stops:
[[[[['if', 'foo[1:2]'], [['bar'], ['baz']]]]]]
I've also tried explicitly adding + lineEnd.suppress() to anything but that didn't seem to help. I'm sure I'm doing something stupid here, probably related to newlines, but can't really figure it out.
On a side note, how do I detect interpolation patterns in anything in the example above (if the example above was to work), so that foo {bar} baz gets parsed to a ['foo', Var('bar'), 'baz']? It's easy to detect {var}, but what's the correct expression for plain text then that's not greedy enough to consume everything thrown at it and that doesn't mess up the if/elif/else logic (I tried using SkipTo but that became quite cumbersome)?
EDIT: adding the separate grammar for parsing the interpolation with an example:
class Substitution(object):
def __init__(self, s, l, t):
self.name = t[0]
def __repr__(self):
return 'Substitution(%r)' % self.name
ParserElement.setDefaultWhitespaceChars(' \t')
lbrace = Literal('{').suppress()
rbrace = Literal('}').suppress()
name = Word(alphas, alphanums + '_')
substitution = Combine(lbrace + name + rbrace).setParseAction(Substitution)
text = SkipTo(substitution | lineEnd.suppress(), include=True).leaveWhitespace()
parser = OneOrMore(text | substitution)
parser.parseString('hello \n {world} {invalid.sub} \n foo {bar} baz ').asList()
outputs
['hello ',
[' ', Substitution('world')],
' {invalid.sub} ',
[' foo ', Substitution('bar')],
' baz ']
As you can see, in this case the parser is not being quite correct at grouping lines together (it doesn't allow text to follow substitution somewhy), but it shows the point. These Substitution objects are then to be later processed at runtime.
Related
I want to parse some C files for functions and a specific command.
My aim is to get all functions that use the specific command as well as all times the command gets called.
Thus I decided to generate extra token for that by using multiple conditions.
Here is my lexer and parser code:
import os
import ply.lex as lex
import ply.yacc as yacc
results = []
calls = []
# Declare the state
states = (
('func', 'exclusive'),
('parameter', 'exclusive'),
('httpgettext', 'exclusive')
)
reserved = {
'void': 'VOID',
'int': 'INT',
'uint8': 'UINT8',
'uint16': 'UINT16',
'uint32': 'UINT32',
'TRet': 'TRET',
'TBool': 'TBOOL',
'bool': 'BOOL',
}
tokens = [
'ID',
'FUNC',
'PARAMETERLIST',
'CALL',
'SEMICOLON'
] + list(reserved.values())
# Start of token description for INITIAL mode (inclusive)
def t_ID(t):
r'[a-zA-Z_][a-zA-Z_0-9]*'
t.type = reserved.get(t.value, 'ID')
return t
# Start of token description for HttpGetText condition
def t_httpgettext_SEMICOLON(t):
r'\;'
t.value = t.lexer.lexdata[t.lexer.call_start:t.lexer.lexpos-1]
t.type = 'CALL'
t.lexer.pop_state()
global calls
arguments = str(t.value).split(',')
calls.append([arguments[1], arguments[2]])
# Start of token description for parameter list condition
def t_parameter(t):
r'\('
t.lexer.parameter_start = t.lexer.lexpos
t.lexer.paren_level = 1
t.lexer.push_state('parameter')
def t_parameter_lparen(t):
r'\('
t.lexer.paren_level += 1
def t_parameter_rparen(t):
r'\)'
t.lexer.paren_level -= 1
if t.lexer.paren_level == 0:
t.value = t.lexer.lexdata[t.lexer.parameter_start:t.lexer.lexpos - 1]
t.type = 'PARAMETERLIST'
t.lexer.pop_state()
return t
# Start of token description for function block condition
def t_func(t):
r'\{'
t.lexer.code_start = t.lexer.lexpos # Record the starting position
t.lexer.brace_level = 1 # Initial brace level
t.lexer.push_state('func') # Enter 'ccode' state
# Rules for the ccode state
def t_func_lbrace(t):
r'\{'
t.lexer.brace_level += 1
def t_func_rbrace(t):
r'\}'
t.lexer.brace_level -= 1
if t.lexer.brace_level == 0:
t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos - 1]
t.type = "FUNC"
t.lexer.lineno += t.value.count('\n')
t.lexer.pop_state()
return t
# Start of token description valid for all conditions
t_ANY_ignore = " \t§$%&+#-_:.<<|',\0"
def t_ANY_HttpGetText(t):
r'HttpGetText'
t.lexer.call_start = t.lexer.lexpos
t.lexer.push_state('httpgettext')
# For bad characters, we just skip over it
def t_ANY_error(t):
t.lexer.skip(1)
def t_ANY_comment(t):
r'(/\*(.|\n)*?\*/)|(//.*)'
pass
def t_ANY_ignore_comments(t):
r'//.*'
pass
def t_ANY_newline(t):
r'\n+'
t.lexer.lineno += len(t.value)
lexer = lex.lex()
def p_statement_function(p):
'statement : type identifier parameter function'
p[0] = p[2]
global results
identifier = str(p[2])
parameter_list = str(p[3]).replace('\n', '')
function_block = str(p[4])
if function_block.find('HttpGetText') != -1:
results.append([identifier, parameter_list, function_block])
print(identifier)
# while True:
# tok = parser.token()
# print(tok)
# if not tok:
# break
def p_parameter_PARAMETERLIST(p):
'parameter : PARAMETERLIST'
p[0] = p[1]
def p_function_FUNC(p):
'function : FUNC'
p[0] = p[1]
def p_identifier_ID(p):
'identifier : ID '
p[0] = p[1]
def p_type_TBOOL(p):
'type : TBOOL'
p[0] = p[1]
def p_type_VOID(p):
'type : VOID'
p[0] = p[1]
def p_type_TRET(p):
'type : TRET'
p[0] = p[1]
def p_type_BOOL(p):
'type : BOOL'
def p_type_INT(p):
'type : INTEGER'
p[0] = p[1]
def p_INTEGER_INT(p):
'INTEGER : INT'
p[0] = p[1]
def p_INTEGER_UINT8(p):
'INTEGER : UINT8'
p[0] = p[1]
def p_INTEGER_UINT16(p):
'INTEGER : UINT16'
p[0] = p[1]
def p_INTEGER_UINT32(p):
'INTEGER : UINT32'
p[0] = p[1]
def p_error(p):
print('Syntax error in input: ', p)
parser.restart()
parser = yacc.yacc()
with open('C:/Users/z0046abb/Desktop/Bachelorarbeit/TextLibraryAnalysis/test_file.txt', 'r') as f:
read_data = f.read()
parser.parse(read_data)
print(results)
print(calls)
This is the content of my test_file.txt:
int main(argv)
{
HttpGetText(Arg1, Arg2, Arg3, Arg4);
return 0
}
void func2(bla, bla, bla)
{
something = random();
HttpGetText(1,2,3,4);
}
void func3(bla, bla, bla)
{
something = random();
HttpGetText(1,21,31,4);
}
void func4(bla, bla, bla)
{
HttpGetText(1, 22, 32, 4);
}
void func5(bla, bla, bla)
{
something();
}
void func6(bla)
{
HttpGetText(1, 23, 33, 4);
}
HttpGetText(1, 24, 34, 4);
HtppGetText(1, 25, 35, 4);
But somehow not all matches are found/processed.
This is the output of a test run:
main
Syntax error in input: LexToken(VOID,'void',12,75)
func3
Syntax error in input: LexToken(VOID,'void',30,243)
Syntax error in input: LexToken(VOID,'void',44,353)
[['main', 'argv', '\n HttpGetText(Arg1, Arg2, Arg3, Arg4);\n\n return 0\n'], ['func3', 'bla, bla, bla', '\n something = random();\n HttpGetText(1,21,31,4);\n']]
[[' Arg2', ' Arg3'], ['2', '3'], ['21', '31'], [' 22', ' 32'], [' 23', ' 33']]
As you can see there is a error at void despite it being a reserved token.
I am not sure if the problem is in my lexer or parser implementation.
If I use the 'lookahead' functionality (part of the function that is a comment) from p_statement_function(p): it seems like all token are correctly labeled.
However the above output only seems to identify main() and func3().
Additionally the last two lines of test_file.txt should be appended as well.
My first idea was to switch from t.lexer.begin(state) to t.lexer.push_state(state) so I could return to the last state the lexer had would help here but it doesn't seem so.
Now I ran out of ideas. It doesn't seem to fail because of the global lists I use to store results (I am aware that global vars are a bit risky).
In addition I am suprised by the fact that main() and func3() are found to be fitting matches other than the rest of implemented functions.
I would be happy if anyone of you has an idea for me.
Edit:
I tried to modify test_file.txt. If there is some nonsense word between every function I can record all functions in my global result list. Though this isn't the solution I wish for.
The immediate problem you have is that the starting point you have given your parser is statement. As it name suggests, that non-terminal matches a single statement (actually, a single function definition). It does not match anything more, so once the statement is complete, the parser expects to see the end of input. Any token will therefore be a syntax error. (Some of these error messages are suppressed by Ply's error recovery mechanism.)
To fix that, you need to add a start non-terminal which recognises a sequence of statements:
program : | program statement
The other relevant error is in your lexer. Since t_ID precedes t_ANY_HttpGetText, it takes priority. That means that in INITIAL state, the HttpGetText token is recognised as an ID something which should have been visible when you tested the scanner. I don't think this is necessarily serious since top-level function calls are illegal in C (even in initialisers of global variables). But you can easily fix it by reordering those two lexer rules.
In an answer to a previous question of yours which I think concerns the same project, I said:
Note that trying to do all this work in the lexer is not generally recommended. Lexers should normally return simple atomic tokens, leaving it to the parser's grammar to do the work of putting the tokens together into a useful structure.
I regret not having made this warning stronger. Although it may seem like a lot of work to correctly tokenise C programs, it's actually not that difficult, and there are lots of examples floating around. A full parse is complicated, but most of the complications have to do with declarations and simplifications are possible if you don't need all that information.
Alternatively, complete open source C parsing solutions exist. There's a certain learning curve associated with using them, but the payoff is the flexibility to analyse program structure systematically without having to immerse yourself in the quirks of C syntax. (These aspects are much more acute if the code you are analysing is C++.)
I am trying to make a simple programme that can help make army lists for a popular tabletop wargame. More as an excercise for my own experience as there are plenty of pre made software packages that do this, but the idea behind it seems fairly straightforward
The programme reads the data for all the units available in an army from a spreadsheet and creates various classes for each unit. The main bit I am looking at now is the options/ upgrades.
In the file I want a straightforward syntax for the option field for each unit. i.e. the following options string itemA, itemB/itemC-3, 2*itemD, itemE/itemF/itemG, itemH/itemI+itemJ would mean
1. you may take itemA (X pts per model)
2. for every 3 models, you may exchange itemB with
a) itemC (net X pts per model)
3. each model may take 2 of itemD (X pts per model)
4. each model may take one of either
a)itemE (X pts per model)
b)itemF (X pts per model)
c)itemG (X pts per model
5. each model may take either
a)itemH (X points per model)
b)itemI and itemJ (X points per model)
At the moment I am processing the string using lots of splits and if statements, that make it very hard to keep track of and assign correctly once the user input their choice.
for index, option in enumerate(self.options):
output = "{}.".format(index+1)
if '-' in option:
sub_option, no_models = option.split('-')
no_models = int(no_models)
print(sub_option)
print(no_models)
output += "For every {} models ".format(no_models)
if '/' in sub_option:
temp_str, temp_options, points_list = exchange_option(sub_option)
else:
temp_str, temp_options, points_list = standard_option(sub_option)
index_points.append(points_list)
temp_options.append(no_models)
index_options.append(temp_options)
else:
if '/' in option:
temp_str, temp_options, points_list = exchange_option(option)
else:
temp_str, temp_options, points_list = standard_option(option)
index_points.append(points_list)
index_options.append(temp_options)
output += temp_str
the *_option() functions are additional helper functions I have defined above which have a similar structure with further if statements within them.
The main question I am asking, is there an easier way to process a code like string such as this? While it works to produce the output in the example above it seems awfully cumbersome to then deal with the user input.
What I am aiming to do is first output the string as given in my example at the top of the question, and then taking the user input index of the given option, modify the associated unit class to have the correct wargear and points value.
I thought about trying to make some kind of options class, but again labelling and defining each option so that they can interact with one another properly seems equally complex, and I feel there must be something more pythonic or just generally better coding practice to processing encoded strings such as this?
So, here's a full blown parser to do that! Now, this only outputs the list as in the previous version of your question, but it shouldn't be too hard to add more features as you want. Also please note that at the moment, the lexer does not error out when a string contains invalid tokens, but that's just a proof-of-concept, so it should be fine.
Part I: the lexer
This tokenises the input string - looks through it from left to right and attempts to classify non-overlapping substrings as instances of tokens. It's to be used before parsing. When given a string, Lexer.tokenize yields a stream of Tokens.
# FILE: lex.py
import re
import enum
class Token:
def __init__(self, type, value: str, lineno: int, pos: int):
self.type, self.value, self.lineno, self.pos = type, value, lineno, pos
def __str__(self):
v = f'({self.value!r})' if self.value else ''
return f'{self.type.name}{v} at {self.lineno}:{self.pos}'
__repr__ = __str__
class Lexer:
def __init__(self, token_types: enum.Enum, tokens_regexes: dict):
self.token_types = token_types
regex = '|'.join(map('(?P<{}>{})'.format, *zip(*((tok.name, regex) for tok, regex in tokens_regexes.items()))))
self.regex = re.compile(regex)
def tokenize(self, string, skip=['space']):
# TODO: detect invalid input
lineno, pos = 0, 0
skip = set(map(self.token_types.__getitem__, skip))
for matchobj in self.regex.finditer(string):
type_name = matchobj.lastgroup
value = matchobj.groupdict()[type_name]
Type = self.token_types[type_name]
if Type == self.token_types.newline: # possibly buggy, but not catastrophic
self.lineno += 1
self.pos = 0
continue
pos = matchobj.end()
if Type not in skip:
yield Token(Type, value, lineno, pos)
yield Token(self.token_types.EOF, '', lineno, pos)
Part II: the parser (with syntax-driven evaluation):
This parses the given stream of tokens provided by lex.Lexer.tokenize and translates individual symbols to English according to the following grammar:
Opt_list -> Option Opt_list_
Opt_list_ -> comma Option Opt_list_ | empty
Option -> Choice | Mult
Choice -> Compound More_choices Exchange
Compound -> item Add_item
Add_item -> plus item Add_item | empty
More_choices -> slash Compound More_choices | empty
Exchange -> minus num | empty
Mult -> num star Compound
The uppercase symbols are nonterminals, the lowercase ones are terminals. There's also a special symbol EOF that's not present here.
Also, take a look at the vital statistics of this grammar. This grammar is LL(1), so we can use an LL(1) recursive descent predictive parser, as shown below.
If you modify the grammar, you should modify the parser accordingly! The methods that do the actual parsing are called parse_<something>, and to change the output of the parser (the Parser.parse function, actually) you should change the return values of these parse_<something> functions.
# FILE: parse.py
import lex
class Parser:
def __init__(self, lexer):
self.string, self.tokens = None, None
self.lexer = lexer
self.t = self.lexer.token_types
self.__lookahead = None
#property
def lookahead(self):
if not self.__lookahead:
try:
self.__lookahead = next(self.tokens)
except StopIteration:
self.__lookahead = lex.Token(self.t.EOF, '', 0, -1)
return self.__lookahead
def next(self):
if self.__lookahead and self.__lookahead.type == self.t.EOF:
return self.__lookahead
self.__lookahead = None
return self.lookahead
def match(self, token_type):
if self.lookahead.type == token_type:
return self.next()
raise SyntaxError(f'Expected {token_type}, got {self.lookahead.type}', ('<string>', self.lookahead.lineno, self.lookahead.pos, self.string))
# THE PARSING STARTS HERE
def parse(self, string):
# setup
self.string = string
self.tokens = self.lexer.tokenize(string)
self.__lookahead = None
self.next()
# do parsing
ret = [''] + self.parse_opt_list()
return ' '.join(ret)
def parse_opt_list(self) -> list:
ret = self.parse_option(1)
ret.extend(self.parse_opt_list_(1))
return ret
def parse_opt_list_(self, curr_opt_number) -> list:
if self.lookahead.type in {self.t.EOF}:
return []
self.match(self.t.comma)
ret = self.parse_option(curr_opt_number + 1)
ret.extend(self.parse_opt_list_(curr_opt_number + 1))
return ret
def parse_option(self, opt_number) -> list:
ret = [f'{opt_number}.']
if self.lookahead.type == self.t.item:
ret.extend(self.parse_choice())
elif self.lookahead.type == self.t.num:
ret.extend(self.parse_mult())
else:
raise SyntaxError(f'Expected {token_type}, got {self.lookahead.type}', ('<string>', self.lookahead.lineno, self.lookahead.pos, self.string))
ret[-1] += '\n'
return ret
def parse_choice(self) -> list:
c = self.parse_compound()
m = self.parse_more_choices()
e = self.parse_exchange()
if not m:
if not e:
ret = f'You may take {" ".join(c)}'
else:
ret = f'for every {e} models you may take item {" ".join(c)}'
elif m:
c.extend(m)
if not e:
ret = f'each model may take one of: {", ".join(c)}'
else:
ret = f'for every {e} models you may exchange the following items with each other: {", ".join(c)}'
else:
ret = 'Semantic error!'
return [ret]
def parse_compound(self) -> list:
ret = [self.lookahead.value]
self.match(self.t.item)
_ret = self.parse_add_item()
return [' '.join(ret + _ret)]
def parse_add_item(self) -> list:
if self.lookahead.type in {self.t.comma, self.t.minus, self.t.slash, self.t.EOF}:
return []
ret = ['with']
self.match(self.t.plus)
ret.append(self.lookahead.value)
self.match(self.t.item)
return ret + self.parse_add_item()
def parse_more_choices(self) -> list:
if self.lookahead.type in {self.t.comma, self.t.minus, self.t.EOF}:
return []
self.match(self.t.slash)
ret = self.parse_compound()
return ret + self.parse_more_choices()
def parse_exchange(self) -> str:
if self.lookahead.type in {self.t.comma, self.t.EOF}:
return ''
self.match(self.t.minus)
ret = self.lookahead.value
self.match(self.t.num)
return ret
def parse_mult(self) -> list:
ret = [f'each model may take {self.lookahead.value} of:']
self.match(self.t.num)
self.match(self.t.star)
return ret + self.parse_compound()
Part III: usage
Here's how to use all of that code:
# FILE: evaluate.py
import enum
from lex import Lexer
from parse import Parser
# these are all the types of tokens present in our grammar
token_types = enum.Enum('Types', 'item num plus minus star slash comma space newline empty EOF')
t = token_types
# these are the regexes that the lexer uses to recognise the tokens
terminals_regexes = {
t.item: r'[a-zA-Z_]\w*',
t.num: '0|[1-9][0-9]*',
t.plus: r'\+',
t.minus: '-',
t.star: r'\*',
t.slash: '/',
t.comma: ',',
t.space: r'[ \t]',
t.newline: r'\n'
}
lexer = Lexer(token_types, terminals_regexes)
parser = Parser(lexer)
string = 'itemA, itemB/itemC-3, 2*itemD, itemE/itemF/itemG, itemH/itemI+itemJ'
print(f'STRING FROM THE QUESTION: {string!r}\nRESULT:')
print(parser.parse(string), '\n\n')
string = input('Enter a command: ')
while string and string.lower() not in {'q', 'quit', 'e', 'exit'}:
try:
print(parser.parse(string))
except SyntaxError as e:
print(f' Syntax error: {e}\n {e.text}\n' + ' ' * (4 + e.offset - 1) + '^\n')
string = input('Enter a command: ')
Example session:
# python3 evaluate.py
STRING FROM THE QUESTION: 'itemA, itemB/itemC-3, 2*itemD, itemE/itemF/itemG, itemH/itemI+itemJ'
RESULT:
1. You may take itemA
2. for every 3 models you may exchange the following items with each other: itemB, itemC
3. each model may take 2 of: itemD
4. each model may take one of: itemE, itemF, itemG
5. each model may take one of: itemH, itemI with itemJ
Enter a command: itemA/b/c/stuff
1. each model may take one of: itemA, b, c, stuff
Enter a command: 4 * anything
1. each model may take 4 of: anything
Enter a command: 5 * anything + more
1. each model may take 5 of: anything with more
Enter a command: a + b + c+ d
1. You may take a with b with c with d
Enter a command: a+b/c
1. each model may take one of: a with b, c
Enter a command: itemA/itemB-2
1. for every 2 models you may exchange the following items with each other: itemA, itemB
Enter a command: itemA+itemB/itemC - 5
1. for every 5 models you may exchange the following items with each other: itemA with itemB, itemC
Enter a command: q
Okay, so I'm trying to build a parser of my mini-language (obviously), and setting variables seems to be properly working. But as soon as Yacc comes across a function definition, it just gives me a syntax error, and a couple of EOF errors (which I know are from when Yacc has no remaining rules to set) and nothing else happens... Where did I go wrong?
Here's an example of the syntax I'm parsing:
$name = "John Doe"
$age = 72
$waterInOceans = 95.4
!testFunction {
}
Where the !testFunction { } section is defining a function (based off of the exclamation point). I don't know if that's going to be useful in debugging.
# The Lexer
import ply.lex as lex
tokens = ["MINUS", "SEPARATOR", "MODIFIER", "FUNCTION_NAME", "UNDEF_BLOCK", "VARIABLE_NAME", "EQUALS", "STRING", "FLOAT", "INNER_CONTENT", "ARGUMENTS", "INTEGER", "PLUS"]
def t_ARGUMENTS(t): # Finds arguments in calls and function definitions
r'\(.*\)'
t.value = t.value[1:-1] # strip parenthesis
t.value = t.value.split(" && ")
return t
def t_STRING(t): # finds strings
r'"\w.+"'
t.value = t.value[1:-1] # strips the quotation marks of the string
return t
def t_FLOAT(t): # finds floats
r'\d+.\d+'
t.value = float(t.value)
return t
def t_INTEGER(t):
r'\d+'
t.value = int(t.value)
return t
def t_VARIABLE_NAME(t):
r'\$\w*\b'
t.value = t.value[1:]
return t
def t_INNER_CONTENT(t):
r'\{\n.*\n\}|\{.*\}'
t.value = t.value[1:-1]
return t
def t_FUNCTION_NAME(t):
r'!\w+'
t.value = t.value[1:]
return t
t_ignore = r"\n|\t|\r"
t_EQUALS = r"\="
t_PLUS = r"\+"
t_MINUS = r"-"
t_MODIFIER = r"\."
t_SEPARATOR = r"\,"
t_UNDEF_BLOCK = r"\w+" # Any block of text that is left over and isn't assigned by the end (used by functions)
def t_error(t):
t.lexer.skip(1)
lex.lex()
#opened = open("example.zeq", "r")
#content = opened.read()
#opened.close()
#lex.input(content)
And then the Yacc half:
# The Yacc parser
import ply.yacc as yacc
import compiler # Get the compiler (tokenizer; compiler.py) which generates tokens
import sys
from os import system
##############
### IGNORE ###
tokens = compiler.tokens
#system("clear")
print("Executing "+sys.argv[1]+" |\n"+("-"*(len(sys.argv[1])+12)))
### IGNORE ###
##############
VARIABLES = {}
FUNCTIONS = {}
def p_assign(p): # Set new variable
'''assignment : VARIABLE_NAME EQUALS compound
| VARIABLE_NAME EQUALS STRING
| VARIABLE_NAME EQUALS INTEGER
| VARIABLE_NAME EQUALS FLOAT'''
#print("Setting '{}' to '{}'...".format(str(p[1]), str(p[3])))
VARIABLES[p[1]] = p[3]
def p_number(p): # Combines floats and integers into a blanket non-terminal for simplicity sakes
'''number : FLOAT
| INTEGER'''
p[0] = p[1]
def p_compound(p): # Complete the value *before* the variable is assigned!
'''compound : number PLUS number
| number MINUS number'''
type1 = type(p[1])
type2 = type(p[3])
operator = p[2]
if operator == "+":
p[0] = p[1] + p[3]
elif operator == "-":
p[0] = p[1] - p[3]
def p_undefined(p):
'''undefined : UNDEF_BLOCK'''
print("Undefined block")
def p_function(p):
'''function : FUNCTION_NAME INNER_CONTENT'''
print("Creating a function")
name = p[1]
content = p[2]
FUNCTIONS[name] = content
def p_empty(p):
'''empty : '''
#~ def p_error(p):
#~ if p:
#~ print("Syntax error: "+p.type)
#~ else:
#~ pass
parser = yacc.yacc()
opened = open(sys.argv[1], "r")
content = opened.read()
opened.close()
for line in content.splitlines():
parser.parse(line)
print(VARIABLES)
print(FUNCTIONS)
I'm waiting for it to be a simple overlooked detail...
When you ask Ply (or yacc, for that matter) to parse an input, it attempts to recognize a single instance of the top-level non-terminal (or "starting symbol"). This will usually a grammatical description of the entire input, so it will often have a name like program, although there are use cases in which it is useful to parse just a part of the input.
Ply (and yacc) assume that the first grammar production is for the starting symbol. In your case, the first production is assignment, and so that is what it will try to parse (and nothing else). assignment cannot derive a function definition or any other statement type, so those cause syntax errors.
If you want to explicitly tell Ply what the top-level symbol is, you can do so. See the manual section on starting symbols.
I would like a compact way to parse one-line strings that start with mandatory list-elements (unspecified number) and ends with dictionary-like definitions using =.
The element-separator should be , and spaces should become part of the element -- which rules out shlex, I think.
Spaces should/may be stripped at the start and end (quotes, too)
If an element would contain a , the user is required to quote with "
either "key=value,with,comma"
or key="value,with,comma" -- whatever is easier to implement
It's ok to have undefined behavior with wrong quoting or with elements containing a quote-char.
Behaviour with double keys is also undefined.
Slight variations of this are ok if it simplifies the implementation a lot.
Lets call the function opts and have it return a list and a dict,
Here are some input examples and desired results:
opts('dog,cat') # -> ["dog", "cat"], {}
opts('big fish,cat') # -> ["big fish", "cat"], {}
opts('"a dog, a cat",a fish') # -> ["a dog, a cat", "a fish"], {}
opts('key=value') # -> [] {'key':'value'}
opts('key=the value,x=y') # -> [] {'key':'the value', 'x':'y'}
opts('dog, big fish, eats="any, but peas", flies = no! '
# -> ['dog','big fish'], {'eats':'any, but peas', 'flies':'no!' }
I disregarded shlex, argparse, optparse and configparser, I can't see how I should do it with those. I am not sure if Regular Expressions crack this nut, though. json is a bit too strict with the syntax, I think. As is eval, if a bit more to my liking (because it parses python ;-))
My manual solution in macro is not very flexible and I would like to have its parameter handling be replaced by the more general opts(s) function described above:
def macro(s):
kw = { 'see':u"\\see", 'type':u"Chapter", 'title': u'??' }
params = s.split(",")
kw['label'] = params[0]
if len(params) > 1: # very inflexible
kw['title'] = params[1]
for param in params[2:]: # wrong if p[1] is already key=value
key, value = param.split("=",1) # doesn't handle anything, too simple
kw[key] = value
# ...rest of code...
The goal is to have the reusable function opts to be used here:
def macro_see(s):
ls, kw = opts(s)
# ...rest of code...
In this solution, opts is essentially the same as yuvi's (with the added strip). The splitter is a customization of shlex, using posix mode to handle quotes.
def mylex(x):
lex = shlex.shlex(x, posix=True)
lex.whitespace = ','
lex.whitespace_split = True
return list(lex)
def opts(x):
ll = []
dd = {}
items = mylex(x)
for item in items:
if '=' in item:
k, v = item.split('=',1)
dd[k.strip(' "')] = v.strip(' "')
else:
ll.append(item.strip(' "'))
return (ll,dd)
It passes:
trials = [
['dog,cat',(["dog", "cat"], {})],
['big fish,cat',(["big fish", "cat"], {})],
['"a dog, a cat",a fish',(["a dog, a cat", "a fish"], {})],
['key=value',([], {'key':'value'})],
['key=the value,x=y',([], {'key':'the value', 'x':'y'})],
['dog, big fish, eats="any, but peas", flies = no!',(['dog','big fish'], {'eats':'any, but peas', 'flies':'no!' })],
]
for (x,y) in trials:
print('%r'%x)
args = opts(x)
print(args)
if args != y:
print('error, %r'%y)
print('')
What you probably want is to create your own split function, with a flag that toggles when " are introduced. Something like this:
def my_split(string, deli):
res = []
flag = True
start = 0
for i, c in enumerate(string):
if c == '"':
if flag:
flag = False
else:
flag = True
if c == deli and flag:
res.append(string[start:i])
start = i+1
res.append(string[start:])
return res
From there, it's really easy to proceed:
def opts(s):
items = map(lambda x: x.strip(), my_split(s, ','))
# collect
ls = []
kw = {}
for item in items:
if '=' in item:
k, v = item.split('=', 1)
kw[k.strip()] = v.strip()
else:
ls.append(item)
return ls, kw
It's not perfect, there are still a few thing you might need to work on, but that's definetly a start.
Here's an approach where I massage the input so it matches the syntax requirements for python function arguments, then harness the python interpreter via eval to parse them.
import re
s = 'hog, "cog" , dog, bog, "big fish", eats="any, but peas", flies = "no!" '
# I think this will add quotes around any unquoted positional arguments
s = re.sub('(^|,)\ *([^\"\',\ ]+)\ *(?=,|$)', r'\1"\2"', s)
def f(*args, **kwargs):
return (args, kwargs)
print eval("f("+s+")", {'f':f})
output:
(('hog', 'cog', 'dog', 'bog', 'big fish'), {'flies': 'no!', 'eats': 'any, but peas'})
I am working on a simple DSL to transform data extracted from MongoDB. I am using python and pyparsing and have gotten reasonably far in creating a grammar that works for basic operators like +/-*, starting from the examples provided. I am currently stuck on how to get my program to evaluate functions of the form Rank[dbRef]. I can evaluate and do arithmetic on dbRefs through the simple operators, but something is not working with my recursion in evaluating functions. I cannot figure out how to access the dbRef argument that was passed in the function call.
Here is the grammar and associated setParseActions:
# Define parser, accounting for the fact that some fields contain whitespace
chars = Word(alphanums + "_-/")
expr = Forward()
integer = Word(nums).setParseAction(EvalConstant)
real = Combine(Word(nums) + "." + Word(nums)).setParseAction(EvalConstant)
# Handle database field references that are coming out of Mongo
dbRef = Combine(chars + OneOrMore(":") + chars)
dbRef.setParseAction(EvalDBref)
# Handle function calls
functionCall = (Keyword("Rank") | Keyword("ZS") | Keyword("Ntile")) + "[" + dbRef + "]"
functionCall.setParseAction(EvalFunction)
operand = (real | integer) | functionCall | dbRef
signop = oneOf('+ -')
multop = oneOf('* /')
plusop = oneOf('+ -')
# Use parse actions to attach Eval constructors to sub-expressions
expr << operatorPrecedence(operand,
[
(signop, 1, opAssoc.RIGHT, EvalSignOp),
(multop, 2, opAssoc.LEFT, EvalMultOp),
(plusop, 2, opAssoc.LEFT, EvalAddOp),
])
formulas = ['Rank[Person:Height]']
for f in formulas:
ret = expr.parseString(f)[0]
print p + ": " + line + " --> " + str(ret.eval())
Here is the relevant code for my evaluation class. The class DOES get called by the parser, but how do I access the argument that is passed to the function?
# Executes functions contained in expressions
class EvalFunction(object):
def __init__(self, tokens):
self.value = tokens[0]
def eval(self):
func = self.value
if func == 'Rank':
# How to evaluate the token that is arg of Function?
return 'Rank Found';
I think I just need a nudge in the right direction to get to the next stage ..
I sorted this out and wanted to provide an answer. My function evaluation class looks like this:
# Executes functions contained in expressions
class EvalFunction(object):
pop_ = {}
def __init__(self, tokens):
self.func_ = tokens.funcname
self.field_ = tokens.arg
def eval(self):
# Get the name of the requested field and source db
# Functions can only be called on dbRef so this always done
v = self.field_.value
fieldRef = v.split(':')
source = fieldRef[0]
field = fieldRef[1]
# Evaluate the dbRef (get the value from the db)
val = self.field_.eval()
if self.func_ == 'Avg':
rec = db['Stats'].find_one({'_id' : field})
return rec['value']['avg']
elif self.func_ == 'Root':
return math.sqrt(val)
my function grammar is:
functionCall = funcNames("funcname") + "[" + dbRef("arg") + "]"
functionCall.setParseAction(EvalFunction)