I am trying to process an input file that describes algorithm behavior. I am using Python's PLY module to define the lexer and parser. I stumbled upon the problem of defining a grammar that forces the user to write this file correctly.
File
# Beginning of the first section
STATES = INITIATOR, IDLE, DONE;
INIT = INITIATOR, IDLE;
TERM = DONE;
# End of first section
# Beginning of the second section
INITIATOR
RANDOM
begin
SEND(x, NEIGHBORS);
BECOME(DONE);
end
IDLE
RECEIVE(x)
begin
SEND(x, NEIGHBORS);
BECOME(DONE);
end
# End of second section
Lexer
import ply.lex as lex
from soda.helpers import prepare_file
class Lexer(object):
keywords = (
'INIT', 'TERM', 'STATES', 'REGISTERS',
'begin', 'end',
'SEND', 'BECOME'
)
tokens = keywords + (
'NAME', 'EQUALS', 'COMMA', 'SEMICOLON',
'LPAREN', 'RPAREN'
)
# Tokens
t_EQUALS = r'='
t_COMMA = r','
t_SEMICOLON = r';'
t_STATES = r'STATES'
t_REGISTERS = r'REGISTERS'
t_INIT = r'INIT'
t_TERM = r'TERM'
t_begin = r'begin'
t_end = r'end'
t_SEND = r'SEND'
t_BECOME = r'BECOME'
t_LPAREN = r'\('
t_RPAREN = r'\)'
# Ignored characters
t_ignore = ' \t\n'
def t_NAME(self, t):
r'[a-zA-Z][a-zA-Z]*'
if t.value in self.keywords: # is this a keyword?
t.type = t.value
return t
def t_error(self, t):
print ("Illegal character {0} at line {1}".format(t.value[0], t.lineno))
t.lexer.skip(1)
def build(self, **kwargs):
self._lexer = lex.lex(module=self, **kwargs)
@prepare_file
def lexical_analysis(self, file):
print ("Started lexical analysis...")
for line in file:
try:
lex_input = line
except EOFError:
break
self._lexer.input(lex_input)
while True:
token = self._lexer.token()
if not token:
break
print (" ", token)
Parser
import ply.yacc as yacc
from soda.helpers import prepare_file
class Parser(object):
def p_algorithm(self, p):
''' algorithm : first_section second_section'''
def p_first_section(self, p):
''' first_section : STATES EQUALS states_list SEMICOLON
| REGISTERS EQUALS register_list SEMICOLON
| INIT EQUALS init_list SEMICOLON
| TERM EQUALS term_list SEMICOLON'''
def p_states_list(self, p):
''' states_list : state_term
| states_list COMMA state_term'''
def p_state_term(self, p):
''' state_term : NAME'''
self.behavior.states.append(p[1])
def p_register_list(self, p):
''' register_list : register_term
| register_list COMMA register_term'''
def p_register_term(self, p):
''' register_term : NAME'''
self.behavior.registers.append(p[1])
def p_init_list(self, p):
''' init_list : init_term
| init_list COMMA init_term'''
def p_init_term(self, p):
''' init_term : NAME'''
self.behavior.init_states.append(p[1])
def p_term_list(self, p):
''' term_list : term_term
| term_list COMMA term_term'''
def p_term_term(self, p):
''' term_term : NAME'''
self.behavior.term_states.append(p[1])
def p_second_section(self, p):
''' second_section : NAME begin commands end'''
def p_error(self, p):
print("Syntax error in input! -> {}".format(p))
def build(self, lexer, behavior):
self.lexer = lexer
self.behavior = behavior
self.tokens = lexer.tokens
self._parser = yacc.yacc(module=self)
@prepare_file
def parsing(self, file):
for line in file:
try:
parser_input = line
print (line)
except EOFError:
break
self._parser.parse(parser_input, lexer=self.lexer._lexer)
Parsing results in a syntax error, and I am not sure how to define rules that enforce the consistency of the file with the algorithm behavior. first_section is parsed fine; the problem is second_section. My solution defines algorithm : first_section second_section and it does not work. I tried defining it as algorithm : first_section | second_section and that works, but this rule means the first and second sections could be swapped in the file.
So my question is how to enforce this with grammar rules so that the user keeps the input file consistent.
Error output
enter STATES = INITIATOR, IDLE, DONE;
Syntax error in input! -> None
INIT = INITIATOR, IDLE;
Syntax error in input! -> None
TERM = DONE;
Syntax error in input! -> None
INITIATOR
Syntax error in input! -> LexToken(NAME,'INITIATOR',1,0)
begin
Syntax error in input! -> LexToken(begin,'begin',1,0)
The program just states that there is a syntax error. The problem is not with the lexical analysis but with the defined grammar. I can define it in such a way that the input is accepted, but then, for example, the user would be able to swap first_section and second_section.
Edit
I think it is not clear from this question what I want to achieve or what my problem is, so I voted to close it. I came up with an idea of how to better state what I am looking for, so I want to raise a new question.
Oops! Your grammar parses the file line by line, which is at least uncommon and does not allow you to control the ordering of lines. IMHO, you should parse the file as a whole. The trick is to pass the parser a tokenfunc function that feeds the lexer one line at a time, and to declare each section to be composed of lines:
class Parser(object):
def p_algorithm(self, p):
''' algorithm : first_section second_section'''
def p_first_section(self, p):
''' first_section : first_section_line
| first_section_line first_section'''
def p_first_section_line(self, p):
''' first_section_line : STATES EQUALS states_list SEMICOLON
| REGISTERS EQUALS register_list SEMICOLON
| INIT EQUALS init_list SEMICOLON
| TERM EQUALS term_list SEMICOLON'''
...
# same for second section...
@prepare_file
def parsing(self, file):
def get_token():
'a tokenizer that automatically feeds the lexer with the next line'
while True:
tok = self.lexer._lexer.token()
if tok is not None: return tok
try:
line = next(file)
self.lexer._lexer.input(line)
except StopIteration:
return None
self._parser.parse("", lexer=self.lexer._lexer, tokenfunc = get_token)
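For completeness, here is a rough sketch of the rules that the "# same for second section..." placeholder above stands for. The rule names (second_section_block, event, commands, command) are hypothetical and assume the token set from the question's lexer; adjust them to whatever the real statement syntax requires (RANDOM, RECEIVE and so on all lex as NAME here):
    # inside the same Parser class as above
    def p_second_section(self, p):
        ''' second_section : second_section_block
                           | second_section_block second_section'''

    def p_second_section_block(self, p):
        ''' second_section_block : NAME event begin commands end'''

    # the input file shows events such as RANDOM and RECEIVE(x);
    # both of them lex as NAME with the question's lexer
    def p_event(self, p):
        ''' event : NAME
                  | NAME LPAREN NAME RPAREN'''

    def p_commands(self, p):
        ''' commands : command
                     | commands command'''

    def p_command(self, p):
        ''' command : SEND LPAREN NAME COMMA NAME RPAREN SEMICOLON
                    | BECOME LPAREN NAME RPAREN SEMICOLON'''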
Related
The following code works well when the sections in the text file are in the order the code expects, i.e. Introduction then Information, but it gives an error if Information comes before Introduction. What would be the solution to handle this using lex/yacc? Thanks in advance.
import ply.lex as lex
# List of token names. This is always required
tokens = [
'CheckupInformation',
'Introduction',
'Information',
'perfect',
'sick',
'LPAREN',
'RPAREN',
'CHAR',
'NUMBER'
]
def t_CheckupInformation(t) : 'CheckupInformation' ; return t
def t_Introduction(t) : 'Introduction' ; return t
def t_Information(t) : 'Information' ; return t
def t_perfect(t): 'perfect'; return t
def t_sick(t) : 'sick'; return t
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_CHAR = r'[a-zA-Z_][a-zA-Z0-9_\-]*'
t_ignore = " \t"
# Define a rule so we can track line numbers
def t_NUMBER(t):
r'[+\-0-9_][0-9_]*'
t.lexer.lineno += len(t.value)
try:
t.value = int(t.value)
except ValueError:
print("Integer value too large %s" % t.value)
t.value = 0
return t
def t_SEMICOLON(t):
r'\;.*'
t.lexer.lineno += len(t.value)
def t_newline(t):
r'\n+'
t.lexer.lineno += len(t.value)
# Error handling rule
def t_error(t):
print("Illegal character '%s'" % t.value[0])
t.lexer.skip(1)
# Build the lexer
lexer = lex.lex()
# define upper level classes first
class stat:
def __init__(self):
self.statement = ""
self.intro = list()
self.body = list()
P=stat()
def p_stat(p):
'Stat : LPAREN CheckupInformation statIntro statBody RPAREN'
p[0]=(p[1],p[2],p[3],p[4],p[5])
def p_Intro(p) :
'''statIntro : LPAREN Introduction Name RPAREN
| statIntro LPAREN Introduction Name RPAREN
| empty'''
if len(p)==5:
p[0] = (p[3])
elif len(p)==6:
p[0] = (p[4])
else:
p[0]= None
P.intro.append(p[0])
def p_Name(p):
'Name : CHAR'
p[0]=p[1]
def p_Body(p):
'''statBody : LPAREN Information bodyinfo RPAREN
| statBody LPAREN Information bodyinfo RPAREN'''
if len(p)==5:
p[0] = (p[3])
elif len(p)==6:
p[0] = (p[4])
P.body.append(p[0])
def p_bodyinfo(p):
'''bodyinfo : LPAREN CHAR perfect RPAREN
| LPAREN CHAR sick RPAREN'''
p[0]=p[2],p[3]
def p_empty(p):
'empty : '
print("This function is called")
pass
def p_error(p):
print("Syntax error in input '%s'!" % p.value)
import ply.yacc as yacc
parser = yacc.yacc()
import sys
if len(sys.argv) < 2 :
sys.exit("Usage: %s <filename>" % sys.argv[0])
fp = open(sys.argv[1])
contents=fp.read()
result=parser.parse(contents)
print("(CheckupInformation")
if (P.intro) != None:
for x in range(len(P.intro)):
print(" (Introduction %s)" %(P.intro[x]))
for x in range(len(P.body)):
print(" (Information( %s %s))" %(P.body[x]))
print(")")
The code works well for File1 but cannot handle File2.
ERROR:
Syntax error in input '(Introduction'!
(CheckupInformation
(Introduction None)
(Information( Anonymous1 perfect))
)
File1:
(CheckupInformation
(Introduction John)
(Introduction Patt)
(Information(Anonymous1 perfect))
(Information(Anonymous2 sick))
)
File2:
(CheckupInformation
(Information(Anonymous1 perfect))
(Information(Anonymous2 sick))
(Introduction John)
(Introduction Patt)
)
This might not be the answer you wanted, but I found myself unable to just change one or two lines in your code. The following is still far from perfect, but I think it is approaching a reasonable approach to your problem. I tried to annotate it with useful comments. Please read through it carefully and try to understand why I did what I did, referring to the Ply manual as necessary (some references are in the code comments, but there's lots of useful background information in the document which I didn't reference specifically).
Good luck.
import ply.lex as lex
# Keyword handling copied from the Ply manual, https://www.dabeaz.com/ply/ply.html#ply_nn6
reserved = {
'CheckupInformation': 'TK_CheckupInformation',
'Introduction': 'TK_Introduction',
'Information': 'TK_Information',
'perfect': 'TK_perfect',
'sick': 'TK_sick',
}
# I changed CHAR to WORD because CHAR sounds like a character
tokens = ['NUMBER','WORD'] + list(reserved.values())
def t_WORD(t):
r'[a-zA-Z_][a-zA-Z0-9_-]*'
t.type = reserved.get(t.value,'WORD') # Check for reserved words
return t
# See the Ply manual: https://www.dabeaz.com/ply/ply.html#ply_nn11
literals = '()'
# See the Ply manual: https://www.dabeaz.com/ply/ply.html#ply_nn8
t_ignore = ' \t\n'
t_ignore_COMMENT = r'\;.*'
# Fixed the regex. You can't have a sign in the middle of a number.
def t_NUMBER(t):
r'[+-]?[0-9_]+'
try:
t.value = int(t.value)
except ValueError:
print("Integer value too large %s" % t.value)
t.value = 0
return t
# See below for the definition of lineno_for_token
# Error handling rule
def t_error(t):
print("Illegal character '%s' at line %d'" % (
t.value[0], t.lexer.lineno_for_token(t)))
t.lexer.skip(1)
# Build the lexer
lexer = lex.lex()
# Ply tracks the character index automatically as lexer.lexpos, and every
# token it produces has a lexpos attribute. So there is no need to misuse
# the lineno attribute for that purpose. It should be the line number of
# the token, as its name indicates.
# You don't seem to use lineno (or lexpos) anywhere, but it is handy for
# error messages. But since it is used rarely, it's easier to compute it
# on demand by counting newlines to the lex position.
# Perhaps this should just be added to the lexer we just built.
lex.Lexer.lineno_for_token = lambda self, t: 1 + self.lexdata.count('\n', 0, t.lexpos)
# Fixed this to use an upper-class name and to derive from object.
# Object to hold a top-level description
class Stat(object):
# Attributes used for components
components = {'intro', 'body'}
def __init__(self, component_dict):
self.statement = "" # I don't know what this is used for
# Copy the components dictionary as attributes, using
# an empty list as default
for k in self.components:
setattr(self, k, component_dict.get(k, ()))
# Verify that we used every key in the dict.
for k in component_dict.keys():
if k not in self.components:
print("Warning! Ignoring " + k
+ " because it is not in Stat.components")
# Arrange for the object to print as expected
def __repr__(self):
return '(CheckupInformation %r %r)' % (self.intro, self.body)
# Instead of having a global "P" object (whose name is not very useful),
# we return a Stat object
def p_stat(p):
""" stat : '(' TK_CheckupInformation components ')' """
p[0] = Stat(p[3])
# We allow all components to be optional and order independent here. We
# also allow them all to be repeated. But that could be made more precise.
# components is a dictionary whose values are lists
def p_components_empty(p):
""" components : """
p[0] = { }
def p_components_append(p):
""" components : components component """
p[0] = p[1]
# The component is a two-element tuple
key, value = p[2]
if key in p[0]:
p[0][key].append(value)
else:
p[0][key] = [value]
# Syntax for each component type (just one element, not a list)
# component is a tuple of (key, value)
# All of the productions just copy the value from some specific syntax.
def p_component(p):
""" component : statIntro
| statBody
"""
p[0] = p[1]
def p_statIntro(p):
"""statIntro : '(' TK_Introduction WORD ')' """
p[0] = ('intro', p[3])
def p_statBody(p):
"""statBody : '(' TK_Information bodyinfo ')' """
p[0] = ('body', p[3])
# bodyinfo is a tuple of (identifier, status)
def p_bodyinfo(p):
"""bodyinfo : '(' WORD TK_perfect ')'
| '(' WORD TK_sick ')'
"""
p[0] = (p[2],p[3])
def p_error(p):
print("Syntax error in input '%s'! at line %d" % (
p.value, p.lexer.lineno_for_token(p)))
import ply.yacc as yacc
parser = yacc.yacc()
# Only do this if we're called from the command line
if __name__ == "__main__":
import sys
if len(sys.argv) < 2 :
sys.exit("Usage: %s <filename>" % sys.argv[0])
with open(sys.argv[1]) as fp:
stat = parser.parse(fp.read())
if stat is not None:
print("(CheckupInformation")
for x in range(len(stat.intro)):
print(" (Introduction %s)" %(stat.intro[x]))
for x in range(len(stat.body)):
print(" (Information( %s %s))" %(stat.body[x]))
print(")")
I'm trying to write a boolean expression parser with variables, using sly as the lexer and parser library. I'm trying to define case-insensitive constants "true" and "false", but I have issues with variable names that start with those constant names. For example, the variable "falseAlarm" is parsed as the "false" constant followed by an "Alarm" variable, so I get a syntax error. I am not very good at parsers, so I really have no idea how to make it right.
Here is my code:
from sly import Lexer, Parser
from dataclasses import dataclass, field
from typing import List
from pprint import pprint
import re
class Lex(Lexer):
tokens = {
LB,
RB,
AND,
OR,
NOT,
TRUE,
FALSE,
ID,
}
ignore = ' \t'
ignore_newline = r'\n+'
LB = r'\('
RB = r'\)'
AND = r'\&\&'
OR = r'\|\|'
NOT = r'(?i)not'
TRUE = r'(?i)true'
FALSE = r'(?i)false'
ID = r'[a-zA-Z][a-zA-Z0-9_]*'
class Pax(Parser):
debugfile = 'parser.out'
tokens = Lex.tokens
@_('boolean_expression boolean_operator boolean_term')
def boolean_expression(self, p):
return (p.boolean_operator, [p.boolean_expression, p.boolean_term])
@_('boolean_term')
def boolean_expression(self, p):
return [p.boolean_term]
@_('AND')
def boolean_operator(self, p):
return p.AND
@_('OR')
def boolean_operator(self, p):
return p.OR
@_('LB boolean_expression RB')
def boolean_term(self, p):
return p.boolean_expression
@_('NOT boolean_term')
def boolean_term(self, p):
return ('not', [p.boolean_term])
@_('boolean_constant')
def boolean_term(self, p):
return p.boolean_constant
@_('ID')
def boolean_term(self, p):
return ('variable', p.ID)
@_('TRUE')
@_('FALSE')
def boolean_constant(self, p):
return ('constant', p)
def error(self, p):
if p:
print(f'Error at token {p.type}, {p.value} at line {p.lineno} col {p.index}')
self.errok()
else:
print('Syntax error at EOF')
TEXT = """
(true || false && true) || falseAlarm
"""
def tokens():
for t in Lex().tokenize(TEXT):
print(t)
yield t
res = Pax().parse(tokens())
print()
pprint(res, indent=4, width=1)
You could change your regex to include word boundaries, i.e. FALSE = r'\bfalse\b'
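For illustration, a minimal sketch of the lexer from the question with word boundaries added. Only the NOT, TRUE and FALSE patterns change; the scoped (?i:...) flag keeps those keywords case-insensitive without affecting the rest of the combined token regex:
from sly import Lexer

class Lex(Lexer):
    tokens = {LB, RB, AND, OR, NOT, TRUE, FALSE, ID}

    ignore = ' \t'
    ignore_newline = r'\n+'

    LB = r'\('
    RB = r'\)'
    AND = r'\&\&'
    OR = r'\|\|'
    # \b stops these keywords from matching the start of a longer identifier
    NOT = r'\b(?i:not)\b'
    TRUE = r'\b(?i:true)\b'
    FALSE = r'\b(?i:false)\b'
    ID = r'[a-zA-Z][a-zA-Z0-9_]*'

# "falseAlarm" now lexes as a single ID instead of FALSE followed by ID
for tok in Lex().tokenize('(TRUE || false) || falseAlarm'):
    print(tok)
The parser class from the question can stay as it is; with these patterns, "falseAlarm" no longer begins with a FALSE token.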
import operator
import re
from ply import lex, yacc
class Lexer(object):
tokens = [
'COMMA',
'TILDE',
'PARAM',
'LP',
'RP',
'FUNC'
]
# Regular expression rules for simple tokens
t_COMMA = r'\,'
t_TILDE = r'\~'
t_PARAM = r'[^\s\(\),&:\"\'~]+'
def __init__(self, dict_obj):
self.dict_obj = dict_obj
def t_LP(self, t):
r'\('
return t
def t_RP(self, t):
r'\)'
return t
def t_FUNC(self, t):
# I want to generate token for this FUNC from the keys of model map
# For eg: r'key1|key2'
r'(?i)FUNC'
return t
# Define a rule so we can track line numbers
def t_newline(self, t):
r'\n+'
t.lexer.lineno += len(t.value)
# A string containing ignored characters (spaces and tabs)
t_ignore = ' \t'
# Error handling rule
def t_error(self, t):
print("Illegal character '%s' on line %d, column %d" % (t.value[0], t.lexer.lineno, t.lexer.lexpos))
t.lexer.skip(1)
# Build the lexer
def build_lexer(self, **kwargs):
self.lexer = lex.lex(module=self, **kwargs)
return self.lexer
class Parser(object):
tokens = Lexer.tokens
def __init__(self, **kwargs):
self.parser = yacc.yacc(module=self, **kwargs)
self.lexer = None
self._dict_obj = None
self.error = ""
self.result = ""
@property
def dict_obj(self):
return self._dict_obj
@dict_obj.setter
def dict_obj(self, dict_obj):
self._dict_obj = dict_obj
self.lexer = Lexer(self._dict_obj).build_lexer()
# Handles LP expression RP
def p_expression(self, p):
"""
expression : LP expression RP
"""
# Handles TILDE PARAM - call search
def p_tilde_param(self, p):
"""
expression : TILDE PARAM
"""
p[0] = p[2]
return p[0]
# Handles ANY LP PARAM RP - call search
def p_expression_any(self, p):
"""
expression : FUNC LP PARAM RP
"""
p[0] = p[3]
return p[0]
# Error handling rule
def p_error(self, p):
if p:
stack_state_str = " ".join([symbol.type for symbol in self.parser.symstack[1:]])
self.error = "Syntax error at %s, type %s, on line %d, Parser state: %s %s . %s" % (
p.value, p.type, p.lineno, self.parser.state, stack_state_str, p
)
else:
self.error = "SYNTAX ERROR IN INPUT"
def get_result(self, input_):
input_ = input_.strip()
if input_:
self.result = self.parser.parse(input_, lexer=self.lexer)
return self.result
else:
raise ValueError("EMPTY EXPRESSION ERROR")
def parser(input_):
par_obj = Parser()
par_obj.dict_obj = {
'key1' : 'value1',
'key2' : 'value2'
}
return par_obj.get_result(input_)
result = parser("~hello")
Above is the code of a lexer and parser using the ply library. I have just encapsulated all of my code in class form. The problems I am facing:
1.) I'm trying to pass a dict_obj to the parser class. I don't know what I am doing wrong, and I get an error like:
AttributeError: 'Parser' object has no attribute 'dict_obj'
2.) What am I trying to do?
I want to pass this dict_obj to the parser class, then pass it on to the lexer class as well, and then make use of it in one of the lexer's token methods (the t_FUNC method). In that method my regex should be built from the keys of this dict_obj.
I think I'm doing something wrong and hence failing to implement it. Please help.
In your constructor (__init__) for the Parser object, you ask Ply to generate a parser before the Parser object is fully constructed:
def __init__(self, **kwargs):
self.parser = yacc.yacc(module=self, **kwargs)
# This is the critical line:
self._dict_obj = None
In order to construct a parser from the object (yacc.yacc(module=self)), Ply needs to iterate over all the object's attributes. For example, it needs to find all the parser functions in order to extract their docstrings in order to determine the grammar.
Ply uses the dir built-in function to make a dictionary of all the object's attributes. Because your Parser object has a custom attribute dict_obj, that key is returned from dir, and so Ply tries to cache that attribute with its value. But when it calls getattr(module, 'dict_obj'), the getter is called, and the getter tries to return self._dict_obj. However, self._dict_obj has not yet been defined, so that ends up throwing an error:
AttributeError: 'Parser' object has no attribute '_dict_obj'
Note that this is not the error message you reported in your question; that error says that there is no attribute dict_obj. Perhaps that was a copy-and-paste error.
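The underlying problem is not Ply-specific; it shows up whenever introspection touches a property before the attribute it reads has been assigned. A small, hypothetical reproduction (no Ply involved):
class Demo(object):
    def __init__(self):
        # Simulates what yacc.yacc(module=self) does, as described above:
        # it walks dir(self) and calls getattr on every name, including
        # the property below, before _value has been assigned.
        for name in dir(self):
            getattr(self, name)
        self._value = None          # assigned too late

    @property
    def value(self):
        return self._value          # _value does not exist yet during __init__

Demo()  # AttributeError: 'Demo' object has no attribute '_value'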
If you move the call to yacc.yacc to the end of the initialiser, that particular problem goes away:
def __init__(self, **kwargs):
self.lexer = None
self._dict_obj = None
self.error = ""
self.result = ""
self.parser = yacc.yacc(module=self, **kwargs)
However, there are a number of other problems in the code excerpt which make it difficult to verify this solution. These include:
There is no LexerNmsysSearch. I assumed you meant Lexer.
There is no node_expression. I have no idea what that is supposed to be so I just removed the test.
Your grammar does not match the input you are testing, so the parser immediately throws a syntax error. I changed the input to "(~hello)" in an attempt to produce something parseable.
The parser actions do not set semantic values, so self.parser.parse() doesn't return any value. This causes get_result to throw an error.
At that point, I gave up on trying to produce anything sensible out of the code. For future reference, please ensure that error messages are quoted exactly and that sample code included in the question can be run.
I am trying to make a simple programme that can help make army lists for a popular tabletop wargame. It is more of an exercise for my own experience, as there are plenty of pre-made software packages that do this, but the idea behind it seems fairly straightforward.
The programme reads the data for all the units available in an army from a spreadsheet and creates various classes for each unit. The main bit I am looking at now is the options/upgrades.
In the file I want a straightforward syntax for the option field for each unit, i.e. the following options string itemA, itemB/itemC-3, 2*itemD, itemE/itemF/itemG, itemH/itemI+itemJ would mean:
1. you may take itemA (X pts per model)
2. for every 3 models, you may exchange itemB with
a) itemC (net X pts per model)
3. each model may take 2 of itemD (X pts per model)
4. each model may take one of either
a)itemE (X pts per model)
b)itemF (X pts per model)
c)itemG (X pts per model)
5. each model may take either
a)itemH (X points per model)
b)itemI and itemJ (X points per model)
At the moment I am processing the string using lots of splits and if statements, which makes it very hard to keep track of everything and assign it correctly once the user inputs their choice.
for index, option in enumerate(self.options):
output = "{}.".format(index+1)
if '-' in option:
sub_option, no_models = option.split('-')
no_models = int(no_models)
print(sub_option)
print(no_models)
output += "For every {} models ".format(no_models)
if '/' in sub_option:
temp_str, temp_options, points_list = exchange_option(sub_option)
else:
temp_str, temp_options, points_list = standard_option(sub_option)
index_points.append(points_list)
temp_options.append(no_models)
index_options.append(temp_options)
else:
if '/' in option:
temp_str, temp_options, points_list = exchange_option(option)
else:
temp_str, temp_options, points_list = standard_option(option)
index_points.append(points_list)
index_options.append(temp_options)
output += temp_str
The *_option() functions are additional helper functions I have defined above, which have a similar structure with further if statements inside them.
The main question I am asking is: is there an easier way to process a code-like string such as this? While it works to produce the output in the example above, it seems awfully cumbersome to then deal with the user input.
What I am aiming to do is first output the string as given in my example at the top of the question, and then, taking the index the user inputs for their chosen option, modify the associated unit class to have the correct wargear and points value.
I thought about trying to make some kind of options class, but again, labelling and defining each option so that they can interact with one another properly seems equally complex, and I feel there must be something more pythonic, or just generally better coding practice, for processing encoded strings such as this.
So, here's a full-blown parser to do that! Now, this only outputs the list as in the previous version of your question, but it shouldn't be too hard to add more features as you want. Also, please note that at the moment the lexer does not error out when a string contains invalid tokens, but this is just a proof of concept, so it should be fine.
Part I: the lexer
This tokenises the input string - looks through it from left to right and attempts to classify non-overlapping substrings as instances of tokens. It's to be used before parsing. When given a string, Lexer.tokenize yields a stream of Tokens.
# FILE: lex.py
import re
import enum
class Token:
def __init__(self, type, value: str, lineno: int, pos: int):
self.type, self.value, self.lineno, self.pos = type, value, lineno, pos
def __str__(self):
v = f'({self.value!r})' if self.value else ''
return f'{self.type.name}{v} at {self.lineno}:{self.pos}'
__repr__ = __str__
class Lexer:
def __init__(self, token_types: enum.Enum, tokens_regexes: dict):
self.token_types = token_types
regex = '|'.join(map('(?P<{}>{})'.format, *zip(*((tok.name, regex) for tok, regex in tokens_regexes.items()))))
self.regex = re.compile(regex)
def tokenize(self, string, skip=['space']):
# TODO: detect invalid input
lineno, pos = 0, 0
skip = set(map(self.token_types.__getitem__, skip))
for matchobj in self.regex.finditer(string):
type_name = matchobj.lastgroup
value = matchobj.groupdict()[type_name]
Type = self.token_types[type_name]
if Type == self.token_types.newline: # possibly buggy, but not catastrophic
lineno += 1
pos = 0
continue
pos = matchobj.end()
if Type not in skip:
yield Token(Type, value, lineno, pos)
yield Token(self.token_types.EOF, '', lineno, pos)
Part II: the parser (with syntax-driven evaluation):
This parses the given stream of tokens provided by lex.Lexer.tokenize and translates individual symbols to English according to the following grammar:
Opt_list -> Option Opt_list_
Opt_list_ -> comma Option Opt_list_ | empty
Option -> Choice | Mult
Choice -> Compound More_choices Exchange
Compound -> item Add_item
Add_item -> plus item Add_item | empty
More_choices -> slash Compound More_choices | empty
Exchange -> minus num | empty
Mult -> num star Compound
The uppercase symbols are nonterminals, the lowercase ones are terminals. There's also a special symbol EOF that's not present here.
Also, take a look at the vital statistics of this grammar. This grammar is LL(1), so we can use an LL(1) recursive descent predictive parser, as shown below.
If you modify the grammar, you should modify the parser accordingly! The methods that do the actual parsing are called parse_<something>, and to change the output of the parser (the Parser.parse function, actually) you should change the return values of these parse_<something> functions.
# FILE: parse.py
import lex
class Parser:
def __init__(self, lexer):
self.string, self.tokens = None, None
self.lexer = lexer
self.t = self.lexer.token_types
self.__lookahead = None
@property
def lookahead(self):
if not self.__lookahead:
try:
self.__lookahead = next(self.tokens)
except StopIteration:
self.__lookahead = lex.Token(self.t.EOF, '', 0, -1)
return self.__lookahead
def next(self):
if self.__lookahead and self.__lookahead.type == self.t.EOF:
return self.__lookahead
self.__lookahead = None
return self.lookahead
def match(self, token_type):
if self.lookahead.type == token_type:
return self.next()
raise SyntaxError(f'Expected {token_type}, got {self.lookahead.type}', ('<string>', self.lookahead.lineno, self.lookahead.pos, self.string))
# THE PARSING STARTS HERE
def parse(self, string):
# setup
self.string = string
self.tokens = self.lexer.tokenize(string)
self.__lookahead = None
self.next()
# do parsing
ret = [''] + self.parse_opt_list()
return ' '.join(ret)
def parse_opt_list(self) -> list:
ret = self.parse_option(1)
ret.extend(self.parse_opt_list_(1))
return ret
def parse_opt_list_(self, curr_opt_number) -> list:
if self.lookahead.type in {self.t.EOF}:
return []
self.match(self.t.comma)
ret = self.parse_option(curr_opt_number + 1)
ret.extend(self.parse_opt_list_(curr_opt_number + 1))
return ret
def parse_option(self, opt_number) -> list:
ret = [f'{opt_number}.']
if self.lookahead.type == self.t.item:
ret.extend(self.parse_choice())
elif self.lookahead.type == self.t.num:
ret.extend(self.parse_mult())
else:
raise SyntaxError(f'Expected item or num, got {self.lookahead.type}', ('<string>', self.lookahead.lineno, self.lookahead.pos, self.string))
ret[-1] += '\n'
return ret
def parse_choice(self) -> list:
c = self.parse_compound()
m = self.parse_more_choices()
e = self.parse_exchange()
if not m:
if not e:
ret = f'You may take {" ".join(c)}'
else:
ret = f'for every {e} models you may take item {" ".join(c)}'
elif m:
c.extend(m)
if not e:
ret = f'each model may take one of: {", ".join(c)}'
else:
ret = f'for every {e} models you may exchange the following items with each other: {", ".join(c)}'
else:
ret = 'Semantic error!'
return [ret]
def parse_compound(self) -> list:
ret = [self.lookahead.value]
self.match(self.t.item)
_ret = self.parse_add_item()
return [' '.join(ret + _ret)]
def parse_add_item(self) -> list:
if self.lookahead.type in {self.t.comma, self.t.minus, self.t.slash, self.t.EOF}:
return []
ret = ['with']
self.match(self.t.plus)
ret.append(self.lookahead.value)
self.match(self.t.item)
return ret + self.parse_add_item()
def parse_more_choices(self) -> list:
if self.lookahead.type in {self.t.comma, self.t.minus, self.t.EOF}:
return []
self.match(self.t.slash)
ret = self.parse_compound()
return ret + self.parse_more_choices()
def parse_exchange(self) -> str:
if self.lookahead.type in {self.t.comma, self.t.EOF}:
return ''
self.match(self.t.minus)
ret = self.lookahead.value
self.match(self.t.num)
return ret
def parse_mult(self) -> list:
ret = [f'each model may take {self.lookahead.value} of:']
self.match(self.t.num)
self.match(self.t.star)
return ret + self.parse_compound()
Part III: usage
Here's how to use all of that code:
# FILE: evaluate.py
import enum
from lex import Lexer
from parse import Parser
# these are all the types of tokens present in our grammar
token_types = enum.Enum('Types', 'item num plus minus star slash comma space newline empty EOF')
t = token_types
# these are the regexes that the lexer uses to recognise the tokens
terminals_regexes = {
t.item: r'[a-zA-Z_]\w*',
t.num: '0|[1-9][0-9]*',
t.plus: r'\+',
t.minus: '-',
t.star: r'\*',
t.slash: '/',
t.comma: ',',
t.space: r'[ \t]',
t.newline: r'\n'
}
lexer = Lexer(token_types, terminals_regexes)
parser = Parser(lexer)
string = 'itemA, itemB/itemC-3, 2*itemD, itemE/itemF/itemG, itemH/itemI+itemJ'
print(f'STRING FROM THE QUESTION: {string!r}\nRESULT:')
print(parser.parse(string), '\n\n')
string = input('Enter a command: ')
while string and string.lower() not in {'q', 'quit', 'e', 'exit'}:
try:
print(parser.parse(string))
except SyntaxError as e:
print(f' Syntax error: {e}\n {e.text}\n' + ' ' * (4 + e.offset - 1) + '^\n')
string = input('Enter a command: ')
Example session:
# python3 evaluate.py
STRING FROM THE QUESTION: 'itemA, itemB/itemC-3, 2*itemD, itemE/itemF/itemG, itemH/itemI+itemJ'
RESULT:
1. You may take itemA
2. for every 3 models you may exchange the following items with each other: itemB, itemC
3. each model may take 2 of: itemD
4. each model may take one of: itemE, itemF, itemG
5. each model may take one of: itemH, itemI with itemJ
Enter a command: itemA/b/c/stuff
1. each model may take one of: itemA, b, c, stuff
Enter a command: 4 * anything
1. each model may take 4 of: anything
Enter a command: 5 * anything + more
1. each model may take 5 of: anything with more
Enter a command: a + b + c+ d
1. You may take a with b with c with d
Enter a command: a+b/c
1. each model may take one of: a with b, c
Enter a command: itemA/itemB-2
1. for every 2 models you may exchange the following items with each other: itemA, itemB
Enter a command: itemA+itemB/itemC - 5
1. for every 5 models you may exchange the following items with each other: itemA with itemB, itemC
Enter a command: q
Okay, so I'm trying to build a parser for my mini-language (obviously), and setting variables seems to be working properly. But as soon as Yacc comes across a function definition, it just gives me a syntax error, plus a couple of EOF errors (which I know come from when Yacc has no remaining rules to apply), and then nothing else happens... Where did I go wrong?
Here's an example of the syntax I'm parsing:
$name = "John Doe"
$age = 72
$waterInOceans = 95.4
!testFunction {
}
Where the !testFunction { } section is defining a function (based off of the exclamation point). I don't know if that's going to be useful in debugging.
# The Lexer
import ply.lex as lex
tokens = ["MINUS", "SEPARATOR", "MODIFIER", "FUNCTION_NAME", "UNDEF_BLOCK", "VARIABLE_NAME", "EQUALS", "STRING", "FLOAT", "INNER_CONTENT", "ARGUMENTS", "INTEGER", "PLUS"]
def t_ARGUMENTS(t): # Finds arguments in calls and function definitions
r'\(.*\)'
t.value = t.value[1:-1] # strip parenthesis
t.value = t.value.split(" && ")
return t
def t_STRING(t): # finds strings
r'"\w.+"'
t.value = t.value[1:-1] # strips the quotation marks of the string
return t
def t_FLOAT(t): # finds floats
r'\d+.\d+'
t.value = float(t.value)
return t
def t_INTEGER(t):
r'\d+'
t.value = int(t.value)
return t
def t_VARIABLE_NAME(t):
r'\$\w*\b'
t.value = t.value[1:]
return t
def t_INNER_CONTENT(t):
r'\{\n.*\n\}|\{.*\}'
t.value = t.value[1:-1]
return t
def t_FUNCTION_NAME(t):
r'!\w+'
t.value = t.value[1:]
return t
t_ignore = r"\n|\t|\r"
t_EQUALS = r"\="
t_PLUS = r"\+"
t_MINUS = r"-"
t_MODIFIER = r"\."
t_SEPARATOR = r"\,"
t_UNDEF_BLOCK = r"\w+" # Any block of text that is left over and isn't assigned by the end (used by functions)
def t_error(t):
t.lexer.skip(1)
lex.lex()
#opened = open("example.zeq", "r")
#content = opened.read()
#opened.close()
#lex.input(content)
And then the Yacc half:
# The Yacc parser
import ply.yacc as yacc
import compiler # Get the compiler (tokenizer; compiler.py) which generates tokens
import sys
from os import system
##############
### IGNORE ###
tokens = compiler.tokens
#system("clear")
print("Executing "+sys.argv[1]+" |\n"+("-"*(len(sys.argv[1])+12)))
### IGNORE ###
##############
VARIABLES = {}
FUNCTIONS = {}
def p_assign(p): # Set new variable
'''assignment : VARIABLE_NAME EQUALS compound
| VARIABLE_NAME EQUALS STRING
| VARIABLE_NAME EQUALS INTEGER
| VARIABLE_NAME EQUALS FLOAT'''
#print("Setting '{}' to '{}'...".format(str(p[1]), str(p[3])))
VARIABLES[p[1]] = p[3]
def p_number(p): # Combines floats and integers into a blanket non-terminal for simplicity sakes
'''number : FLOAT
| INTEGER'''
p[0] = p[1]
def p_compound(p): # Complete the value *before* the variable is assigned!
'''compound : number PLUS number
| number MINUS number'''
type1 = type(p[1])
type2 = type(p[3])
operator = p[2]
if operator == "+":
p[0] = p[1] + p[3]
elif operator == "-":
p[0] = p[1] - p[3]
def p_undefined(p):
'''undefined : UNDEF_BLOCK'''
print("Undefined block")
def p_function(p):
'''function : FUNCTION_NAME INNER_CONTENT'''
print("Creating a function")
name = p[1]
content = p[2]
FUNCTIONS[name] = content
def p_empty(p):
'''empty : '''
#~ def p_error(p):
#~ if p:
#~ print("Syntax error: "+p.type)
#~ else:
#~ pass
parser = yacc.yacc()
opened = open(sys.argv[1], "r")
content = opened.read()
opened.close()
for line in content.splitlines():
parser.parse(line)
print(VARIABLES)
print(FUNCTIONS)
I'm waiting for it to be a simple overlooked detail...
When you ask Ply (or yacc, for that matter) to parse an input, it attempts to recognize a single instance of the top-level non-terminal (or "starting symbol"). This will usually be a grammatical description of the entire input, so it will often have a name like program, although there are use cases in which it is useful to parse just a part of the input.
Ply (and yacc) assume that the first grammar production is for the starting symbol. In your case, the first production is assignment, and so that is what it will try to parse (and nothing else). assignment cannot derive a function definition or any other statement type, so those cause syntax errors.
If you want to explicitly tell Ply what the top-level symbol is, you can do so. See the manual section on starting symbols.
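For reference, a minimal sketch of how the grammar from the question could be restructured so that the top-level symbol covers every statement type. The program and statement rule names are hypothetical, and the actions and the compound rules from the question are omitted for brevity (yacc will warn about the unused tokens in this trimmed version):
import ply.yacc as yacc
import compiler  # the lexer module from the question

tokens = compiler.tokens

# Optional here, since p_program is the first rule anyway, but this is the
# explicit "start symbol" mechanism described in the manual.
start = 'program'

def p_program(p):
    '''program : program statement
               | statement'''

def p_statement(p):
    '''statement : assignment
                 | function'''

def p_assignment(p):
    '''assignment : VARIABLE_NAME EQUALS STRING
                  | VARIABLE_NAME EQUALS INTEGER
                  | VARIABLE_NAME EQUALS FLOAT'''

def p_function(p):
    '''function : FUNCTION_NAME INNER_CONTENT'''

parser = yacc.yacc()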