Okay, so I'm trying to build a parser of my mini-language (obviously), and setting variables seems to be properly working. But as soon as Yacc comes across a function definition, it just gives me a syntax error, and a couple of EOF errors (which I know are from when Yacc has no remaining rules to set) and nothing else happens... Where did I go wrong?
Here's an example of the syntax I'm parsing:
$name = "John Doe"
$age = 72
$waterInOceans = 95.4
!testFunction {
}
Where the !testFunction { } section is defining a function (based off of the exclamation point). I don't know if that's going to be useful in debugging.
# The Lexer
import ply.lex as lex
tokens = ["MINUS", "SEPARATOR", "MODIFIER", "FUNCTION_NAME", "UNDEF_BLOCK", "VARIABLE_NAME", "EQUALS", "STRING", "FLOAT", "INNER_CONTENT", "ARGUMENTS", "INTEGER", "PLUS"]
def t_ARGUMENTS(t): # Finds arguments in calls and function definitions
r'\(.*\)'
t.value = t.value[1:-1] # strip parenthesis
t.value = t.value.split(" && ")
return t
def t_STRING(t): # finds strings
r'"\w.+"'
t.value = t.value[1:-1] # strips the quotation marks of the string
return t
def t_FLOAT(t): # finds floats
r'\d+.\d+'
t.value = float(t.value)
return t
def t_INTEGER(t):
r'\d+'
t.value = int(t.value)
return t
def t_VARIABLE_NAME(t):
r'\$\w*\b'
t.value = t.value[1:]
return t
def t_INNER_CONTENT(t):
r'\{\n.*\n\}|\{.*\}'
t.value = t.value[1:-1]
return t
def t_FUNCTION_NAME(t):
r'!\w+'
t.value = t.value[1:]
return t
t_ignore = r"\n|\t|\r"
t_EQUALS = r"\="
t_PLUS = r"\+"
t_MINUS = r"-"
t_MODIFIER = r"\."
t_SEPARATOR = r"\,"
t_UNDEF_BLOCK = r"\w+" # Any block of text that is left over and isn't assigned by the end (used by functions)
def t_error(t):
t.lexer.skip(1)
lex.lex()
#opened = open("example.zeq", "r")
#content = opened.read()
#opened.close()
#lex.input(content)
And then the Yacc half:
# The Yacc parser
import ply.yacc as yacc
import compiler # Get the compiler (tokenizer; compiler.py) which generates tokens
import sys
from os import system
##############
### IGNORE ###
tokens = compiler.tokens
#system("clear")
print("Executing "+sys.argv[1]+" |\n"+("-"*(len(sys.argv[1])+12)))
### IGNORE ###
##############
VARIABLES = {}
FUNCTIONS = {}
def p_assign(p): # Set new variable
'''assignment : VARIABLE_NAME EQUALS compound
| VARIABLE_NAME EQUALS STRING
| VARIABLE_NAME EQUALS INTEGER
| VARIABLE_NAME EQUALS FLOAT'''
#print("Setting '{}' to '{}'...".format(str(p[1]), str(p[3])))
VARIABLES[p[1]] = p[3]
def p_number(p): # Combines floats and integers into a blanket non-terminal for simplicity sakes
'''number : FLOAT
| INTEGER'''
p[0] = p[1]
def p_compound(p): # Complete the value *before* the variable is assigned!
'''compound : number PLUS number
| number MINUS number'''
type1 = type(p[1])
type2 = type(p[3])
operator = p[2]
if operator == "+":
p[0] = p[1] + p[3]
elif operator == "-":
p[0] = p[1] - p[3]
def p_undefined(p):
'''undefined : UNDEF_BLOCK'''
print("Undefined block")
def p_function(p):
'''function : FUNCTION_NAME INNER_CONTENT'''
print("Creating a function")
name = p[1]
content = p[2]
FUNCTIONS[name] = content
def p_empty(p):
'''empty : '''
#~ def p_error(p):
#~ if p:
#~ print("Syntax error: "+p.type)
#~ else:
#~ pass
parser = yacc.yacc()
opened = open(sys.argv[1], "r")
content = opened.read()
opened.close()
for line in content.splitlines():
parser.parse(line)
print(VARIABLES)
print(FUNCTIONS)
I'm waiting for it to be a simple overlooked detail...
When you ask Ply (or yacc, for that matter) to parse an input, it attempts to recognize a single instance of the top-level non-terminal (or "starting symbol"). This will usually a grammatical description of the entire input, so it will often have a name like program, although there are use cases in which it is useful to parse just a part of the input.
Ply (and yacc) assume that the first grammar production is for the starting symbol. In your case, the first production is assignment, and so that is what it will try to parse (and nothing else). assignment cannot derive a function definition or any other statement type, so those cause syntax errors.
If you want to explicitly tell Ply what the top-level symbol is, you can do so. See the manual section on starting symbols.
Related
I want to parse some C files for functions and a specific command.
My aim is to get all functions that use the specific command as well as all times the command gets called.
Thus I decided to generate extra token for that by using multiple conditions.
Here is my lexer and parser code:
import os
import ply.lex as lex
import ply.yacc as yacc
results = []
calls = []
# Declare the state
states = (
('func', 'exclusive'),
('parameter', 'exclusive'),
('httpgettext', 'exclusive')
)
reserved = {
'void': 'VOID',
'int': 'INT',
'uint8': 'UINT8',
'uint16': 'UINT16',
'uint32': 'UINT32',
'TRet': 'TRET',
'TBool': 'TBOOL',
'bool': 'BOOL',
}
tokens = [
'ID',
'FUNC',
'PARAMETERLIST',
'CALL',
'SEMICOLON'
] + list(reserved.values())
# Start of token description for INITIAL mode (inclusive)
def t_ID(t):
r'[a-zA-Z_][a-zA-Z_0-9]*'
t.type = reserved.get(t.value, 'ID')
return t
# Start of token description for HttpGetText condition
def t_httpgettext_SEMICOLON(t):
r'\;'
t.value = t.lexer.lexdata[t.lexer.call_start:t.lexer.lexpos-1]
t.type = 'CALL'
t.lexer.pop_state()
global calls
arguments = str(t.value).split(',')
calls.append([arguments[1], arguments[2]])
# Start of token description for parameter list condition
def t_parameter(t):
r'\('
t.lexer.parameter_start = t.lexer.lexpos
t.lexer.paren_level = 1
t.lexer.push_state('parameter')
def t_parameter_lparen(t):
r'\('
t.lexer.paren_level += 1
def t_parameter_rparen(t):
r'\)'
t.lexer.paren_level -= 1
if t.lexer.paren_level == 0:
t.value = t.lexer.lexdata[t.lexer.parameter_start:t.lexer.lexpos - 1]
t.type = 'PARAMETERLIST'
t.lexer.pop_state()
return t
# Start of token description for function block condition
def t_func(t):
r'\{'
t.lexer.code_start = t.lexer.lexpos # Record the starting position
t.lexer.brace_level = 1 # Initial brace level
t.lexer.push_state('func') # Enter 'ccode' state
# Rules for the ccode state
def t_func_lbrace(t):
r'\{'
t.lexer.brace_level += 1
def t_func_rbrace(t):
r'\}'
t.lexer.brace_level -= 1
if t.lexer.brace_level == 0:
t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos - 1]
t.type = "FUNC"
t.lexer.lineno += t.value.count('\n')
t.lexer.pop_state()
return t
# Start of token description valid for all conditions
t_ANY_ignore = " \t§$%&+#-_:.<<|',\0"
def t_ANY_HttpGetText(t):
r'HttpGetText'
t.lexer.call_start = t.lexer.lexpos
t.lexer.push_state('httpgettext')
# For bad characters, we just skip over it
def t_ANY_error(t):
t.lexer.skip(1)
def t_ANY_comment(t):
r'(/\*(.|\n)*?\*/)|(//.*)'
pass
def t_ANY_ignore_comments(t):
r'//.*'
pass
def t_ANY_newline(t):
r'\n+'
t.lexer.lineno += len(t.value)
lexer = lex.lex()
def p_statement_function(p):
'statement : type identifier parameter function'
p[0] = p[2]
global results
identifier = str(p[2])
parameter_list = str(p[3]).replace('\n', '')
function_block = str(p[4])
if function_block.find('HttpGetText') != -1:
results.append([identifier, parameter_list, function_block])
print(identifier)
# while True:
# tok = parser.token()
# print(tok)
# if not tok:
# break
def p_parameter_PARAMETERLIST(p):
'parameter : PARAMETERLIST'
p[0] = p[1]
def p_function_FUNC(p):
'function : FUNC'
p[0] = p[1]
def p_identifier_ID(p):
'identifier : ID '
p[0] = p[1]
def p_type_TBOOL(p):
'type : TBOOL'
p[0] = p[1]
def p_type_VOID(p):
'type : VOID'
p[0] = p[1]
def p_type_TRET(p):
'type : TRET'
p[0] = p[1]
def p_type_BOOL(p):
'type : BOOL'
def p_type_INT(p):
'type : INTEGER'
p[0] = p[1]
def p_INTEGER_INT(p):
'INTEGER : INT'
p[0] = p[1]
def p_INTEGER_UINT8(p):
'INTEGER : UINT8'
p[0] = p[1]
def p_INTEGER_UINT16(p):
'INTEGER : UINT16'
p[0] = p[1]
def p_INTEGER_UINT32(p):
'INTEGER : UINT32'
p[0] = p[1]
def p_error(p):
print('Syntax error in input: ', p)
parser.restart()
parser = yacc.yacc()
with open('C:/Users/z0046abb/Desktop/Bachelorarbeit/TextLibraryAnalysis/test_file.txt', 'r') as f:
read_data = f.read()
parser.parse(read_data)
print(results)
print(calls)
This is the content of my test_file.txt:
int main(argv)
{
HttpGetText(Arg1, Arg2, Arg3, Arg4);
return 0
}
void func2(bla, bla, bla)
{
something = random();
HttpGetText(1,2,3,4);
}
void func3(bla, bla, bla)
{
something = random();
HttpGetText(1,21,31,4);
}
void func4(bla, bla, bla)
{
HttpGetText(1, 22, 32, 4);
}
void func5(bla, bla, bla)
{
something();
}
void func6(bla)
{
HttpGetText(1, 23, 33, 4);
}
HttpGetText(1, 24, 34, 4);
HtppGetText(1, 25, 35, 4);
But somehow not all matches are found/processed.
This is the output of a test run:
main
Syntax error in input: LexToken(VOID,'void',12,75)
func3
Syntax error in input: LexToken(VOID,'void',30,243)
Syntax error in input: LexToken(VOID,'void',44,353)
[['main', 'argv', '\n HttpGetText(Arg1, Arg2, Arg3, Arg4);\n\n return 0\n'], ['func3', 'bla, bla, bla', '\n something = random();\n HttpGetText(1,21,31,4);\n']]
[[' Arg2', ' Arg3'], ['2', '3'], ['21', '31'], [' 22', ' 32'], [' 23', ' 33']]
As you can see there is a error at void despite it being a reserved token.
I am not sure if the problem is in my lexer or parser implementation.
If I use the 'lookahead' functionality (part of the function that is a comment) from p_statement_function(p): it seems like all token are correctly labeled.
However the above output only seems to identify main() and func3().
Additionally the last two lines of test_file.txt should be appended as well.
My first idea was to switch from t.lexer.begin(state) to t.lexer.push_state(state) so I could return to the last state the lexer had would help here but it doesn't seem so.
Now I ran out of ideas. It doesn't seem to fail because of the global lists I use to store results (I am aware that global vars are a bit risky).
In addition I am suprised by the fact that main() and func3() are found to be fitting matches other than the rest of implemented functions.
I would be happy if anyone of you has an idea for me.
Edit:
I tried to modify test_file.txt. If there is some nonsense word between every function I can record all functions in my global result list. Though this isn't the solution I wish for.
The immediate problem you have is that the starting point you have given your parser is statement. As it name suggests, that non-terminal matches a single statement (actually, a single function definition). It does not match anything more, so once the statement is complete, the parser expects to see the end of input. Any token will therefore be a syntax error. (Some of these error messages are suppressed by Ply's error recovery mechanism.)
To fix that, you need to add a start non-terminal which recognises a sequence of statements:
program : | program statement
The other relevant error is in your lexer. Since t_ID precedes t_ANY_HttpGetText, it takes priority. That means that in INITIAL state, the HttpGetText token is recognised as an ID something which should have been visible when you tested the scanner. I don't think this is necessarily serious since top-level function calls are illegal in C (even in initialisers of global variables). But you can easily fix it by reordering those two lexer rules.
In an answer to a previous question of yours which I think concerns the same project, I said:
Note that trying to do all this work in the lexer is not generally recommended. Lexers should normally return simple atomic tokens, leaving it to the parser's grammar to do the work of putting the tokens together into a useful structure.
I regret not having made this warning stronger. Although it may seem like a lot of work to correctly tokenise C programs, it's actually not that difficult, and there are lots of examples floating around. A full parse is complicated, but most of the complications have to do with declarations and simplifications are possible if you don't need all that information.
Alternatively, complete open source C parsing solutions exist. There's a certain learning curve associated with using them, but the payoff is the flexibility to analyse program structure systematically without having to immerse yourself in the quirks of C syntax. (These aspects are much more acute if the code you are analysing is C++.)
The following code works well when the text file are in sequence with code i.e, Introduction then Information but gives error if Information comes before Introduction. What would be the solution to handle this using lex/yacc? Thank in advance.
import ply.lex as lex
# List of token names. This is always required
tokens = [
'CheckupInformation',
'Introduction',
'Information',
'perfect',
'sick',
'LPAREN',
'RPAREN',
'CHAR',
'NUMBER'
]
def t_CheckupInformation(t) : 'CheckupInformation' ; return t
def t_Introduction(t) : 'Introduction' ; return t
def t_Information(t) : 'Information' ; return t
def t_perfect(t): 'perfect'; return t
def t_sick(t) : 'sick'; return t
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_CHAR = r'[a-zA-Z_][a-zA-Z0-9_\-]*'
t_ignore = " \t"
# Define a rule so we can track line numbers
def t_NUMBER(t):
r'[+\-0-9_][0-9_]*'
t.lexer.lineno += len(t.value)
try:
t.value = int(t.value)
except ValueError:
print("Integer value too large %s" % t.value)
t.value = 0
return t
def t_SEMICOLON(t):
r'\;.*'
t.lexer.lineno += len(t.value)
def t_newline(t):
r'\n+'
t.lexer.lineno += len(t.value)
# Error handling rule
def t_error(t):
print("Illegal character '%s'" % t.value[0])
t.lexer.skip(1)
# Build the lexer
lexer = lex.lex()
# define upper level classes first
class stat:
def __init__(self):
self.statement = ""
self.intro = list()
self.body = list()
P=stat()
def p_stat(p):
'Stat : LPAREN CheckupInformation statIntro statBody RPAREN'
p[0]=(p[1],p[2],p[3],p[4],p[5])
def p_Intro(p) :
'''statIntro : LPAREN Introduction Name RPAREN
| statIntro LPAREN Introduction Name RPAREN
| empty'''
if len(p)==5:
p[0] = (p[3])
elif len(p)==6:
p[0] = (p[4])
else:
p[0]= None
P.intro.append(p[0])
def p_Name(p):
'Name : CHAR'
p[0]=p[1]
def p_Body(p):
'''statBody : LPAREN Information bodyinfo RPAREN
| statBody LPAREN Information bodyinfo RPAREN'''
if len(p)==5:
p[0] = (p[3])
elif len(p)==6:
p[0] = (p[4])
P.body.append(p[0])
def p_bodyinfo(p):
'''bodyinfo : LPAREN CHAR perfect RPAREN
| LPAREN CHAR sick RPAREN'''
p[0]=p[2],p[3]
def p_empty(p):
'empty : '
print("This function is called")
pass
def p_error(p):
print("Syntax error in input '%s'!" % p.value)
import ply.yacc as yacc
parser = yacc.yacc()
import sys
if len(sys.argv) < 2 :
sys.exit("Usage: %s <filename>" % sys.argv[0])
fp = open(sys.argv[1])
contents=fp.read()
result=parser.parse(contents)
print("(CheckupInformation")
if (P.intro) != None:
for x in range(len(P.intro)):
print(" (Introduction %s)" %(P.intro[x]))
for x in range(len(P.body)):
print(" (Information( %s %s))" %(P.body[x]))
print(")")
The code works well for file1 & cannot handle file2.
ERROR:
Syntax error in input '(Introduction'!
(CheckupInformation
(Introduction None)
(Information( Anonymous1 perfect))
)
File1:
(CheckupInformation
(Introduction John)
(Introduction Patt)
(Information(Anonymous1 perfect))
(Information(Anonymous2 sick))
)
File2:
(CheckupInformation
(Information(Anonymous1 perfect))
(Information(Anonymous2 sick))
(Introduction John)
(Introduction Patt)
)
This might not be the answer you wanted, but I found myself unable to just change one or two lines in your code. The following is still far from perfect, but I think it is approaching a reasonable approach to your problem. I tried to annotate it with useful comments. Please read through it carefully and try to understand why I did what I did, referring to the Ply manual as necessary (some references are in the code comments, but there's lots of useful background information in the document which I didn't reference specifically).
Good luck.
import ply.lex as lex
# Keyword handling copied from the Ply manual, https://www.dabeaz.com/ply/ply.html#ply_nn6
reserved = {
'CheckupInformation': 'TK_CheckupInformation',
'Introduction': 'TK_Introduction',
'Information': 'TK_Information',
'perfect': 'TK_perfect',
'sick': 'TK_sick',
}
# I changed CHAR to WORD because CHAR sounds like a character
tokens = ['NUMBER','WORD'] + list(reserved.values())
def t_WORD(t):
r'[a-zA-Z_][a-zA-Z0-9_-]*'
t.type = reserved.get(t.value,'WORD') # Check for reserved words
return t
# See the Ply manual: https://www.dabeaz.com/ply/ply.html#ply_nn11
literals = '()'
# See the Ply manual: https://www.dabeaz.com/ply/ply.html#ply_nn8
t_ignore = ' \t\n'
t_ignore_COMMENT = r'\;.*'
# Fixed the regex. You can't have a sign in the middle of a number.
def t_NUMBER(t):
r'[+-]?[0-9_]+'
try:
t.value = int(t.value)
except ValueError:
print("Integer value too large %s" % t.value)
t.value = 0
return t
# See below for the definition of lineno_for_token
# Error handling rule
def t_error(t):
print("Illegal character '%s' at line %d'" % (
t.value[0], t.lexer.lineno_for_token(t)))
t.lexer.skip(1)
# Build the lexer
lexer = lex.lex()
# Ply tracks the character index automatically as lexer.lexpos, and every
# token it produces has a lexpos attribute. So there is no need to misuse
# the lineno attribute for that purpose. It should be the line number of
# the token, as its name indicates.
# You don't seem to use lineno (or lexpos) anywhere, but it is handy for
# error messages. But since it is used rarely, it's easier to compute it
# on demand by counting newlines to the lex position.
# Perhaps this should just be added to the lexer we just built.
lex.Lexer.lineno_for_token = lambda self, t: 1 + self.lexdata.count('\n', 0, t.lexpos)
# Fixed this to use an upper-class name and to derive from object.
# Object to hold a top-level description
class Stat(object):
# Attributes used for components
components = {'intro', 'body'}
def __init__(self, component_dict):
self.statement = "" # I don't know what this is used for
# Copy the components dictionary as attributes, using
# an empty list as default
for k in self.components:
setattr(self, k, component_dict.get(k, ()))
# Verify that we used every key in the dict.
for k in component_dict.keys():
if k not in self.components:
print("Warning! Ignoring " + k
+ " because it is not in Stat.components")
# Arrange for the object to print as expected
def __repr__(self):
return '(CheckupInformation %r %r)' % (self.intro, self.body)
# Instead of having a global "P" object (whose name is not very useful),
# we return a Stat object
def p_stat(p):
""" stat : '(' TK_CheckupInformation components ')' """
p[0] = Stat(p[3])
# We allow all components to be optional and order independent here. We
# also allow them all to be repeated. But that could be made more precise.
# components is a dictionary whose values are lists
def p_components_empty(p):
""" components : """
p[0] = { }
def p_components_append(p):
""" components : components component """
p[0] = p[1]
# The component is a two-element tuple
key, value = p[2]
if key in p[0]:
p[0][key].append(value)
else:
p[0][key] = [value]
# Syntax for each component type (just one element, not a list)
# component is a tuple of (key, value)
# All of the productions just copy the value from some specific syntax.
def p_component(p):
""" component : statIntro
| statBody
"""
p[0] = p[1]
def p_statIntro(p):
"""statIntro : '(' TK_Introduction WORD ')' """
p[0] = ('intro', p[3])
def p_statBody(p):
"""statBody : '(' TK_Information bodyinfo ')' """
p[0] = ('body', p[3])
# bodyinfo is a tuple of (identifier, status)
def p_bodyinfo(p):
"""bodyinfo : '(' WORD TK_perfect ')'
| '(' WORD TK_sick ')'
"""
p[0] = (p[2],p[3])
def p_error(p):
print("Syntax error in input '%s'! at line %d" % (
p.value, p.lexer.lineno_for_token(p)))
import ply.yacc as yacc
parser = yacc.yacc()
# Only do this if we're called from the command line
if __name__ == "__main__":
import sys
if len(sys.argv) < 2 :
sys.exit("Usage: %s <filename>" % sys.argv[0])
with open(sys.argv[1]) as fp:
stat = parser.parse(fp.read())
if stat is not None:
print("(CheckupInformation")
for x in range(len(stat.intro)):
print(" (Introduction %s)" %(stat.intro[x]))
for x in range(len(stat.body)):
print(" (Information( %s %s))" %(stat.body[x]))
print(")")
I am reading the first example from
https://github.com/dabeaz/ply
It is a basic calculator allowing for only expression involving '(',')','+','-','*','/', integers and assignement (for instance x=3) and throwing the evaluation of the expression (even it it's result is not an integer, for instance '3/4').
I would like to allow for floating numbers, so that I basically modified the code from the example as follows but it doesn't work :
# -----------------------------------------------------------------------------
# calc.py
#
# A simple calculator with variables.
# -----------------------------------------------------------------------------
tokens = (
'NAME','INTEGER', 'FLOAT',
'PLUS','MINUS','TIMES','DIVIDE','EQUALS',
'LPAREN','RPAREN',
)
# Tokens
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'
t_EQUALS = r'='
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
def t_INTEGER(t):
r'\d+'
t.value = int(t.value)
return t
def t_FLOAT(t):
r'/^(?!0\d)\d*(\.\d+)?$/mg'
t.value = float(t.value)
return t
# Ignored characters
t_ignore = " \t"
def t_newline(t):
r'\n+'
t.lexer.lineno += t.value.count("\n")
def t_error(t):
print("Illegal character '%s'" % t.value[0])
t.lexer.skip(1)
# Build the lexer
import ply.lex as lex
lex.lex()
# Precedence rules for the arithmetic operators
precedence = (
('left','PLUS','MINUS'),
('left','TIMES','DIVIDE'),
('right','UMINUS'),
)
# dictionary of names (for storing variables)
names = { }
def p_statement_assign(p):
'statement : NAME EQUALS expression'
names[p[1]] = p[3]
def p_statement_expr(p):
'statement : expression'
print(p[1])
def p_expression_binop(p):
'''expression : expression PLUS expression
| expression MINUS expression
| expression TIMES expression
| expression DIVIDE expression'''
if p[2] == '+' : p[0] = p[1] + p[3]
elif p[2] == '-': p[0] = p[1] - p[3]
elif p[2] == '*': p[0] = p[1] * p[3]
elif p[2] == '/': p[0] = p[1] / p[3]
def p_expression_uminus(p):
'expression : MINUS expression %prec UMINUS'
p[0] = -p[2]
def p_expression_group(p):
'expression : LPAREN expression RPAREN'
p[0] = p[2]
def p_expression_integer(p):
'expression : INTEGER'
p[0] = p[1]
def p_expression_float(p):
'expression : FLOAT'
p[0] = p[1]
def p_expression_name(p):
'expression : NAME'
try:
p[0] = names[p[1]]
except LookupError:
print("Undefined name '%s'" % p[1])
p[0] = 0
def p_error(p):
print("Syntax error at '%s'" % p.value)
import ply.yacc as yacc
yacc.yacc()
while True:
try:
s = input('calc > ')
except EOFError:
break
yacc.parse(s)
I have error :
calc > 3.14+1
Illegal character '.'
Syntax error at '14'
ply parses the T_xxx members in the order of declaration (using reflection on your module). What happens here is that T_INTEGER matches before T_FLOAT. So the integer part of your float is parsed, then ply chokes on the dot.
That would directly work if your regex for floats wasn't off (completely missed that point in my first answer, blinded by the obvious wrong order).
I've simplified it to \d+\.\d+ (which doesn't match 1. or .9 so not the best choice), but you can borrow a better one taken from a similar issue: PLY lexer for numbers always returns double
You have to get T_FLOAT parsed before T_INTEGER. Just swap both declarations to do so:
def t_FLOAT(t):
r'\d+\.\d+'
# a better regex taking exponents into account:
'[-+]?[0-9]+(\.([0-9]+)?([eE][-+]?[0-9]+)?|[eE][-+]?[0-9]+)'
t.value = float(t.value)
return t
def t_INTEGER(t):
r'\d+'
t.value = int(t.value)
return t
As a general rule for ply, do this for all patterns that are longer/more specific than others to avoid conflicts.
You have two problems in your lex file. First is the token order as explained by Jean-François: the longer tokens must be defined first in lex (ref from ply doc.):
When building the master regular expression, rules are added in the following order:
All tokens defined by functions are added in the same order as they appear in the lexer file.
Tokens defined by strings are added next by sorting them in order of decreasing regular expression length (longer expressions are added first).
But the string defining the token shall be a re compatible string. Your FLOAT definition is awfully broken here. If we define a float as composed of exactly one dot, and optional digits before or after the dot and not a dot alone, an acceptable definition could be:
r'(\d*\.\d+)|(\d+\.\d*)'
In particular, the shlashes / shall not be included in the string...
I am trying to process input file with description of algorithm behavior. I am using python's PLY module for defining lexer and parser. I stumbled upon problem of defining grammar which will enforce user to correctly write this file.
File
# Beginning of the first section
STATES = INITIATOR, IDLE, DONE;
INIT = INITIATOR, IDLE;
TERM = DONE;
# End of first section
# Beginning of the second section
INITIATOR
RANDOM
begin
SEND(x, NEIGHBORS);
BECOME(DONE);
end
IDLE
RECEIVE(x)
begin
SEND(x, NEIGHBORS);
BECOME(DONE);
end
# End of second section
Lexer
import ply.lex as lex
from soda.helpers import prepare_file
class Lexer(object):
keywords = (
'INIT', 'TERM', 'STATES', 'REGISTERS',
'begin', 'end',
'SEND', 'BECOME'
)
tokens = keywords + (
'NAME', 'EQUALS', 'COMMA', 'SEMICOLON',
'LPAREN', 'RPAREN'
)
# Tokens
t_EQUALS = r'='
t_COMMA = r','
t_SEMICOLON = r';'
t_STATES = r'STATES'
t_REGISTERS = r'REGISTERS'
t_INIT = r'INIT'
t_TERM = r'TERM'
t_begin = r'begin'
t_end = r'end'
t_SEND = r'SEND'
t_BECOME = r'BECOME'
t_LPAREN = r'\('
t_RPAREN = r'\)'
# Ignored characters
t_ignore = ' \t\n'
def t_NAME(self, t):
r'[a-zA-Z][a-zA-Z]*'
if t.value in self.keywords: # is this a keyword?
t.type = t.value
return t
def t_error(self, t):
print ("Illegal character {0} at line {1}".format(t.value[0], t.lineno))
t.lexer.skip(1)
def build(self, **kwargs):
self._lexer = lex.lex(module=self, **kwargs)
#prepare_file
def lexical_analysis(self, file):
print ("Started lexical analysis...")
for line in file:
try:
lex_input = line
except EOFError:
break
self._lexer.input(lex_input)
while True:
token = self._lexer.token()
if not token:
break
print (" ", token)
Parser
import ply.yacc as yacc
from soda.helpers import prepare_file
class Parser(object):
def p_algorithm(self, p):
''' algorithm : first_section second_section'''
def p_first_section(self, p):
''' first_section : STATES EQUALS states_list SEMICOLON
| REGISTERS EQUALS register_list SEMICOLON
| INIT EQUALS init_list SEMICOLON
| TERM EQUALS term_list SEMICOLON'''
def p_states_list(self, p):
''' states_list : state_term
| states_list COMMA state_term'''
def p_state_term(self, p):
''' state_term : NAME'''
self.behavior.states.append(p[1])
def p_register_list(self, p):
''' register_list : register_term
| register_list COMMA register_term'''
def p_register_term(self, p):
''' register_term : NAME'''
self.behavior.registers.append(p[1])
def p_init_list(self, p):
''' init_list : init_term
| init_list COMMA init_term'''
def p_init_term(self, p):
''' init_term : NAME'''
self.behavior.init_states.append(p[1])
def p_term_list(self, p):
''' term_list : term_term
| term_list COMMA term_term'''
def p_term_term(self, p):
''' term_term : NAME'''
self.behavior.term_states.append(p[1])
def p_second_section(self, p):
''' second_section : NAME begin commands end'''
def p_error(self, p):
print("Syntax error in input! -> {}".format(p))
def build(self, lexer, behavior):
self.lexer = lexer
self.behavior = behavior
self.tokens = lexer.tokens
self._parser = yacc.yacc(module=self)
#prepare_file
def parsing(self, file):
for line in file:
try:
parser_input = line
print (line)
except EOFError:
break
self._parser.parse(parser_input, lexer=self.lexer._lexer)
Parsing results in syntax error and I am not sure how to define rules to enforce the consistency of file with algorithm behavior. first_section is parsed ok and problem is second_section. My solution defines that algorithm : first_section second_section and it is not working. I tried to define it like algorithm: first_section | second_section and it works good but this rule states that first and second section can be switched in file.
So my question is how to enforce it with rules so user will keep the input file consistent.
Error output
enter STATES = INITIATOR, IDLE, DONE;
Syntax error in input! -> None
INIT = INITIATOR, IDLE;
Syntax error in input! -> None
TERM = DONE;
Syntax error in input! -> None
INITIATOR
Syntax error in input! -> LexToken(NAME,'INITIATOR',1,0)
begin
Syntax error in input! -> LexToken(begin,'begin',1,0)
Program just states there is error in syntax. Problem is not with lexical analysis but with defined grammar. I can define it in such way that input is accepted but for example user would be able to switch first_section with second_section.
Edit
I think it is not clear from this question what I want to achieve or my problem so I voted to close it. I came up with idea how to better state what I am looking for so I want to raise new question.
Oups! Your grammar parses the file line by line, which is at least uncommon and does not allow to control the ordering of lines. IMHO, you should parse the file as a whole. The trick is to pass the parser a tokenfunc function that will feed the lexer with one line at a time, and declare each section to be composed of lines:
class Parser(object):
def p_algorithm(self, p):
''' algorithm : first_section second_section'''
def p_first_section(self, p):
''' first_section : first_section_line
| first_section_line first_section'''
def p_first_section_line(self, p):
''' first_section_line : STATES EQUALS states_list SEMICOLON
| REGISTERS EQUALS register_list SEMICOLON
| INIT EQUALS init_list SEMICOLON
| TERM EQUALS term_list SEMICOLON'''
...
# same for second section...
#prepare_file
def parsing(self, file):
def get_token():
'a tokenizer that automatically feeds the lexer with the next line'
while True:
tok = self.lexer._lexer.token()
if tok is not None: return tok
try:
line = next(file)
self.lexer._lexer.input(line)
except StopIteration:
return None
self._parser.parse("", lexer=self.lexer._lexer, tokenfunc = get_token)
All,
I'm writing a very simplistic parser with python PLY. It mostly does the job, but for many of the lines of input, I get a Syntax error from yacc. Here is the lexer and parser code, slightly modified for easier testing:
tokens = ('VAR', 'NUMBER', 'CLOSE', 'JUNK')
# Tokens
t_VAR = r'%[mM]\['
t_CLOSE = r'\]'
t_JUNK = r'.'
# Ignored characters
t_ignore = " \t\r"
def t_NUMBER(t):
r'\d+'
try:
t.value = int(t.value)
except ValueError:
print("Integer value too large %d", t.value)
t.value = 0
return t
def t_newline(t):
r'\n+'
t.lexer.lineno += t.value.count("\n")
def t_error(t):
print("Illegal character '%s'" % t.value[0])
t.lexer.skip(1)
# Build the lexer
import ply.lex as lex
lex.lex()
# Parsing rules
def p_statement(p):
'''statement : field'''
try:
print p[1]
except IndexError:
pass
def p_trash(p):
'''statement : JUNK'''
pass
def p_field(p):
'''field : VAR NUMBER CLOSE'''
#print p[1], p[2], p[3]
p[0] = p[2]
def p_error(p):
print("Syntax error at '%s'" % repr(p)) #p.value)
import ply.yacc as yacc
yacc.yacc()
For a sample: yacc.parse('.set %m[702] $substr($currentlength,2,$currentpg)') which gives as output:
Syntax error at 'LexToken(JUNK,'s',1,1)'
Syntax error at 'LexToken(JUNK,'$',1,13)'
It should output 702 only.
Your top level rule requires a single statement. p_trash matches the first '.' and returns a statement and there is no top level rule to allow it to continue. You could do something like:
def p_junk(p):
'''statement | JUNK statement'''
You could also do something like this (and create a list of statements):
def p_statements(p):
'''statements | statement statements
| empty'''