SimpleParse not showing the result tree - python

I am working on the Google ProtoBuff where I am trying to parse the proto file using SimpleParse in python.
I am using EBNF format with SimpleParse, it shows success but there is nothing in the result Tree, not sure what is going wrong. Any help would really be appreciated.
Following is the grammar file:
proto ::= ( message / extend / enum / import / package / option / ';' )*
import ::= 'import' , strLit , ';'
package ::= 'package' , ident , ( '.' , ident )* , ';'
option ::= 'option' , optionBody , ';'
optionBody ::= ident , ( '.' , ident )* , '=' , constant
message ::= 'message' , ident , messageBody
extend ::= 'extend' , userType , '{' , ( field / group / ';' )* , '}'
enum ::= 'enum' , ident , '{' , ( option / enumField / ';' )* , '}'
enumField ::= ident , '=' , intLit , ';'
service ::= 'service' , ident , '{' , ( option / rpc / ';' )* , '}'
rpc ::= 'rpc' , ident , '(' , userType , ')' , 'returns' , '(' , userType , ')' , ';'
messageBody ::= '{' , ( field / enum / message / extend / extensions / group / option / ':' )* , '}'
group ::= label , 'group' , camelIdent , '=' , intLit , messageBody
field ::= label , type , ident , '=' , intLit , ( '[' , fieldOption , ( ',' , fieldOption )* , ']' )? , ';'
fieldOption ::= optionBody / 'default' , '=' , constant
extensions ::= 'extensions' , extension , ( ',' , extension )* , ';'
extension ::= intLit , ( 'to' , ( intLit / 'max' ) )?
label ::= 'required' / 'optional' / 'repeated'
type ::= 'double' / 'float' / 'int32' / 'int64' / 'uint32' / 'uint64' / 'sint32' / 'sint64' / 'fixed32' / 'fixed64' / 'sfixed32' / 'sfixed64' / 'bool' / 'string' / 'bytes' / userType
userType ::= '.'? , ident , ( '.' , ident )*
constant ::= ident / intLit / floatLit / strLit / boolLit
ident ::= [A-Za-z_],[A-Za-z0-9_]*
camelIdent ::= [A-Z],[\w_]*
intLit ::= decInt / hexInt / octInt
decInt ::= [1-9],[\d]*
hexInt ::= [0],[xX],[A-Fa-f0-9]+
octInt ::= [0],[0-7]+
floatLit ::= [\d]+ , [\.\d+]?
boolLit ::= 'true' / 'false'
strLit ::= quote ,( hexEscape / octEscape / charEscape / [^\0\n] )* , quote
quote ::= ['']
hexEscape ::= [\\],[Xx],[A-Fa-f0-9]
octEscape ::= [\\0]? ,[0-7]
charEscape ::= [\\],[abfnrtv\\\?'']
And this is the python code that I am experimenting with:
from simpleparse.parser import Parser
from pprint import pprint
protoGrammar = ""
protoInput = ""
protoGrammarRoot = "proto"
with open ("proto_grammar.ebnf", "r") as grammarFile:
protoGrammar=grammarFile.read()
with open("sample.proto", "r") as protoFile:
protoInput = protoFile.read().replace('\n', '')
parser = Parser(protoGrammar,protoGrammarRoot)
success, resultTree, newCharacter = parser.parse(protoInput)
pprint(protoInput)
pprint(success)
pprint(resultTree)
pprint(newCharacter)
and this the proto file that I am trying to parse
message AmbiguousMsg {
optional string mypack_ambiguous_msg = 1;
optional string mypack_ambiguous_msg1 = 1;
}
I get the output as
1
[]
0

I am new to Python but I came up with this, although I am not entirely sure of your output format. Hopefully this will point you in the right direction. Feel free to modify the code below to cater your requirements.
#!/usr/bin/python
# (c) 2015 enthusiasticgeek for StackOverflow. Use the code in anyway you want but leave credits intact. Also use this code at your own risk. I do not take any responsibility for your usage - blame games and trolls will strictly *NOT* be tolerated.
import re
#data_types=['string','bool','enum','int32','uint32','int64','uint64','sint32','sint64','bytes','string','fixed32','sfixed32','float','fixed64','sfixed64','double']
#function # 1
#Generate list of units in the brackets
#================ tokens based on braces ====================
def find_balanced_braces(args):
parts = []
for arg in args:
if '{' not in arg:
continue
chars = []
n = 0
for c in arg:
if c == '{':
if n > 0:
chars.append(c)
n += 1
elif c == '}':
n -= 1
if n > 0:
chars.append(c)
elif n == 0:
parts.append(''.join(chars).lstrip().rstrip())
chars = []
elif n > 0:
chars.append(c)
return parts
#function # 2
#================ Retrieve Nested Levels ====================
def find_nested_levels(test, count_level):
count_level=count_level+1
level = find_balanced_braces(test)
if not bool(level):
return count_level-1
else:
return find_nested_levels(level,count_level)
#function # 3
#================ Process Nested Levels ====================
def process_nested_levels(test, count_level):
count_level=count_level+1
level = find_balanced_braces(test)
print "===== Level = " + str(count_level) + " ====="
for i in range(len(level)):
#print level[i] + "\n"
exclusive_level_messages = ''.join(level[i]).split("message")[0]
exclusive_level_messages_tokenized = ''.join(exclusive_level_messages).split(";")
#print exclusive_level_messages + "\n"
for j in range(len(exclusive_level_messages_tokenized)):
pattern = exclusive_level_messages_tokenized[j].lstrip()
print pattern
#match = "\message \s*(.*?)\s*\{"+pattern
#match_result = re.findall(match, level[i])
#print match_result
print "===== End Level ====="
if not bool(level):
return count_level-1
else:
return process_nested_levels(level,count_level)
#============================================================
#=================================================================================
test_string=("message a{ optional string level-i1.l1.1 = 1 [default = \"/\"]; "
"message b{ required bool level-i1.l2.1 = 1; required fixed32 level-i1.l2.1 = 2; "
"message c{ required string level-i1.l3.1 = 1; } "
"} "
"} "
"message d{ required uint64 level-i2.l1.1 = 1; required double level-i2.l1.2 = 2; "
"message e{ optional double level-i2.l2.1 = 1; "
"message f{ optional fixed64 level-i2.l3.1 = 1; required fixed32 level-i2.l3.2 = 2; "
"message g{ required bool level-i2.l4.1 = 2; } "
"} "
"} "
"} "
"message h{ required uint64 level-i3.l1.1 = 1; required double level-i3.l1.2 = 2; }")
#Right now I do not see point in replacing \n with blank space
with open ("fileproto.proto", "r") as myfile:
data=myfile.read().replace('\n', '\n')
print data
count_level=0
#replace 'data' in the following line with 'test_string' for tests
nested_levels=process_nested_levels([data],count_level)
print "Total count levels depth = " + str(nested_levels)
print "========================\n"
My output looks as follows
// This defines protocol for a simple server that lists files.
//
// See also the nanopb-specific options in fileproto.options.
message ListFilesRequest {
optional string path = 1 [default = "/"];
}
message FileInfo {
required uint64 inode = 1;
required string name = 2;
}
message ListFilesResponse {
optional bool path_error = 1 [default = false];
repeated FileInfo file = 2;
}
===== Level = 1 =====
optional string path = 1 [default = "/"]
required uint64 inode = 1
required string name = 2
optional bool path_error = 1 [default = false]
repeated FileInfo file = 2
===== End Level =====
===== Level = 2 =====
===== End Level =====
Total count levels depth = 1
========================
NOTE After print pattern you can tokenize further if necessary by taking pattern as in input. I have commented one example with regex.

Related

Python How to scan for a certain letter in a input

So, I'm making some code that I want it to do a if statement that looks through the input and if it sees a certain character will then go print a message.
password = input("password: ")
if(password == "scan for a certain letter or something"):
print("Please use don't use these smybols: / \ ; : , . > < ? ! # # $ % ^ & * ( ) [ ] { } | ")
it dosen't have to be a if statement if that is what it turns out to be, I just want to know how to do it.
You can use the in operator with the any() function:
prohibited = set("/ \ ; : , . > < ? ! # # $ % ^ & * ( ) [ ] { } | ]".split())
password = input("password: ")
if any(char in prohibited for char in password):
print(f"Please use don't use these symbols: {prohibited}")
I think it will be more convenient for the user to get an exhaustive list of invalid characters used by him so that he can correct the input:
unacceptable_symbols = r"/\;:,.><?!##$%^&*()[]{}| "
password = r"f;kjad%9203)29e2" # test string
lst = [x for x in password if x in unacceptable_symbols] # makes list of all used unacceptable symbols
if len(lst) > 0:
print("You have used the following unacceptable characters: " + ' '.join(lst))
print("Please don't use these symbols: " + ' '.join(list(unacceptable_symbols)))
# Output:
# You have used the following unacceptable characters: ; % )
# Please don't use these symbols: / \ ; : , . > < ? ! # # $ % ^ & * ( ) [ ] { } |
I have tried to implement this code with very simple logic. And its working perfectly.
s="/ ; : , . > < ? ! # # $ % ^ & * ( ) [ ] { } | \ "
s1=s.split()
s3=input("enter the string: ")
for i in s3:
if(i in s1):
print("please dont use the given symbol:",i)

Get correct brace grouping from string

I have files with incorrect JSON that I want to start fixing by getting it into properly grouped chunks.
The brace grouping {{ {} {} } } {{}} {{{}}} should already be correct
How can I grab all the top-level braces, correctly grouped, as separate strings?
If you don't want to install any extra modules simple function will do:
def top_level(s):
depth = 0
start = -1
for i, c in enumerate(s):
if c == '{':
if depth == 0:
start = i
depth += 1
elif c == '}' and depth:
depth -= 1
if depth == 0:
yield s[start:i+1]
print(list(top_level('{{ {} {} } } {{}} {{{}}}')))
Output:
['{{ {} {} } }', '{{}}', '{{{}}}']
It will skip invalid braces but could be easily modified to report an error when they are spotted.
Using the regex module:
In [1]: import regex
In [2]: braces = regex.compile(r"\{(?:[^{}]++|(?R))*\}")
In [3]: braces.findall("{{ {} {} } } {{}} {{{}}}")
Out[3]: ['{{ {} {} } }', '{{}}', '{{{}}}']
pyparsing can be really helpful here. It will handle pathological cases where you have braces inside strings, etc. It might be a little tricky to do all of this work yourself, but fortunately, somebody (the author of the library) has already done the hard stuff for us.... I'll reproduce the code here to prevent link-rot:
# jsonParser.py
#
# Implementation of a simple JSON parser, returning a hierarchical
# ParseResults object support both list- and dict-style data access.
#
# Copyright 2006, by Paul McGuire
#
# Updated 8 Jan 2007 - fixed dict grouping bug, and made elements and
# members optional in array and object collections
#
json_bnf = """
object
{ members }
{}
members
string : value
members , string : value
array
[ elements ]
[]
elements
value
elements , value
value
string
number
object
array
true
false
null
"""
from pyparsing import *
TRUE = Keyword("true").setParseAction( replaceWith(True) )
FALSE = Keyword("false").setParseAction( replaceWith(False) )
NULL = Keyword("null").setParseAction( replaceWith(None) )
jsonString = dblQuotedString.setParseAction( removeQuotes )
jsonNumber = Combine( Optional('-') + ( '0' | Word('123456789',nums) ) +
Optional( '.' + Word(nums) ) +
Optional( Word('eE',exact=1) + Word(nums+'+-',nums) ) )
jsonObject = Forward()
jsonValue = Forward()
jsonElements = delimitedList( jsonValue )
jsonArray = Group(Suppress('[') + Optional(jsonElements) + Suppress(']') )
jsonValue << ( jsonString | jsonNumber | Group(jsonObject) | jsonArray | TRUE | FALSE | NULL )
memberDef = Group( jsonString + Suppress(':') + jsonValue )
jsonMembers = delimitedList( memberDef )
jsonObject << Dict( Suppress('{') + Optional(jsonMembers) + Suppress('}') )
jsonComment = cppStyleComment
jsonObject.ignore( jsonComment )
def convertNumbers(s,l,toks):
n = toks[0]
try:
return int(n)
except ValueError, ve:
return float(n)
jsonNumber.setParseAction( convertNumbers )
Phew! That's a lot ... Now how do we use it? The general strategy here will be to scan the string for matches and then slice those matches out of the original string. Each scan result is a tuple of the form (lex-tokens, start_index, stop_index). For our use, we don't care about the lex-tokens, just the start and stop. We could do: string[result[1], result[2]] and it would work. We can also do string[slice(*result[1:])] -- Take your pick.
results = jsonObject.scanString(testdata)
for result in results:
print '*' * 80
print testdata[slice(*result[1:])]

Parse C-like declarations using pyparsing

I would like to parse declarations using pyparsing in a C-like source (GLSL code) such that I get a list of (type, name, value).
For example:
int a[3];
int b=1, c=2.0;
float d = f(z[2], 2) + 3*g(4,a), e;
Point f = {1,2};
I would like to obtain something like:
[ ('int', 'a[3]', ''),
('int', 'b', '1'),
('int', 'c', '2.0'),
('float', 'd', 'f(z[2], 2) + 3*g(4,a)'),
('float', 'e', ''),
('Point', 'f', '{1,2}') ]
I've played with Forward() and operatorPrecedence() to try to parse the rhs expression but I suspect it is not necessary in my case.
So far I have:
IDENTIFIER = Regex('[a-zA-Z_][a-zA-Z_0-9]*')
INTEGER = Regex('([+-]?(([1-9][0-9]*)|0+))')
EQUAL = Literal("=").suppress()
SEMI = Literal(";").suppress()
SIZE = INTEGER | IDENTIFIER
VARNAME = IDENTIFIER
TYPENAME = IDENTIFIER
VARIABLE = Group(VARNAME.setResultsName("name")
+ Optional(EQUAL + Regex("[^,;]*").setResultsName("value")))
VARIABLES = delimitedList(VARIABLE.setResultsName("variable",listAllMatches=True))
DECLARATION = (TYPENAME.setResultsName("type")
+ VARIABLES.setResultsName("variables", listAllMatches=True) + SEMI)
code = """
float a=1, b=3+f(2), c;
float d=1.0, e;
float f = z(3,4);
"""
for (token, start, end) in DECLARATION.scanString(code):
for variable in token.variable:
print token.type, variable.name, variable.value
but the last expression (f=z(3,4)) is not parsed because of the ,.
There is a C struct parser on the pyparsing wiki that might give you a good start.
This seems to work.
IDENTIFIER = Word(alphas+"_", alphas+nums+"_" )
INT_DECIMAL = Regex('([+-]?(([1-9][0-9]*)|0+))')
INT_OCTAL = Regex('(0[0-7]*)')
INT_HEXADECIMAL = Regex('(0[xX][0-9a-fA-F]*)')
INTEGER = INT_HEXADECIMAL | INT_OCTAL | INT_DECIMAL
FLOAT = Regex('[+-]?(((\d+\.\d*)|(\d*\.\d+))([eE][-+]?\d+)?)|(\d*[eE][+-]?\d+)')
LPAREN, RPAREN = Literal("(").suppress(), Literal(")").suppress()
LBRACK, RBRACK = Literal("[").suppress(), Literal("]").suppress()
LBRACE, RBRACE = Literal("{").suppress(), Literal("}").suppress()
SEMICOLON, COMMA = Literal(";").suppress(), Literal(",").suppress()
EQUAL = Literal("=").suppress()
SIZE = INTEGER | IDENTIFIER
VARNAME = IDENTIFIER
TYPENAME = IDENTIFIER
OPERATOR = oneOf("+ - * / [ ] . & ^ ! { }")
PART = nestedExpr() | nestedExpr('{','}') | IDENTIFIER | INTEGER | FLOAT | OPERATOR
EXPR = delimitedList(PART, delim=Empty()).setParseAction(keepOriginalText)
VARIABLE = (VARNAME("name") + Optional(LBRACK + SIZE + RBRACK)("size")
+ Optional(EQUAL + EXPR)("value"))
VARIABLES = delimitedList(VARIABLE.setResultsName("variables",listAllMatches=True))
DECLARATION = (TYPENAME("type") + VARIABLES + SEMICOLON)
code = """
int a[3];
int b=1, c=2.0;
float d = f(z[2], 2) + 3*g(4,a), e;
Point f = {1,2};
"""
for (token, start, end) in DECLARATION.scanString(code):
vtype = token.type
for variable in token.variables:
name = variable.name
size = variable.size
value = variable.value
s = "%s / %s" % (vtype,name)
if size: s += ' [%s]' % size[0]
if value: s += ' / %s' % value[0]
s += ";"
print s

python parsley - domain name syntax

I'm trying to get a limited syntax for domain names to work. The syntax is defined at https://www.rfc-editor.org/rfc/rfc1035 Section 2.3.1. A subset of it is as under
<label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
<ldh-str> ::= <let-dig-hyp> | <let-dig-hyp> <ldh-str>
<let-dig-hyp> ::= <let-dig> | "-"
<let-dig> ::= <letter> | <digit>
<letter> ::= any one of the 52 alphabetic characters A through Z in upper case and a through z in lower case
<digit> ::= any one of the ten digits 0 through 9
My attempt below. I'm trying to match label
from parsley import makeGrammar
import ometa
domain = makeGrammar('''
letdighyp = (letterOrDigit|-)
label = letterOrDigit letdighyp+ letterOrDigit
''', {})
tests = ('abcd1000',)
for t in tests:
try:
print domain(t).label()
except ometa.runtime.ParseError as e:
print 'parse failed for', t
print e
running that gives me
parse failed for abcd1000
abcd1000
^
Parse error at line 2, column 0: expected EOF. trail: [digit letdig letdighyp]
What am I doing wrong ?
P.S.
label = letterOrDigit letdighyp+ letterOrDigit
is the line I'm not able to get working. It matches the string if the last letterOrDigit isn't there.
Try:
from parsley import makeGrammar
import ometa
def check_the_rest(s):
if '-' in s:
return s[-1].isalnum()
return True
domain = makeGrammar('''
letdighyp = (letterOrDigit|'-')
# label = <letterOrDigit (letdighyp* letterOrDigit)*>
label = <letter the_rest>
the_rest = letdighyp*:r ?(check_the_rest(r)) -> r
''', dict(check_the_rest=check_the_rest))
tests = ('a', 'abcd1000', 'a-',)
for t in tests:
try:
print('✓', domain(t).label())
except ometa.runtime.ParseError as e:
print('parse failed for', t)
print(e)
(The last test should fail)
I'm using the py3 branch by vsajip.

How to get pyparser to work in a particular form

Sorry for the sorry title. I could not think of anything better
I am trying to implement a DSL with pyparsing that has the following requirements:
varaibles: All of them begin with v_
Unary operators: +, -
Binary operators: +,-,*,/,%
Constant numbers
Functions, like normal functions when they have just one variable
Functions need to work like this: foo(v_1+v_2) = foo(v_1) + foo(v_2), foo(bar(10*v_6))=foo(bar(10))*foo(bar(v_6)). It should be the case for any binary operation
I am able to get 1-5 working
This is the code I have so far
from pyparsing import *
exprstack = []
#~ #traceParseAction
def pushFirst(tokens):
exprstack.insert(0,tokens[0])
# define grammar
point = Literal( '.' )
plusorminus = Literal( '+' ) | Literal( '-' )
number = Word( nums )
integer = Combine( Optional( plusorminus ) + number )
floatnumber = Combine( integer +
Optional( point + Optional( number ) ) +
Optional( integer )
)
ident = Combine("v_" + Word(nums))
plus = Literal( "+" )
minus = Literal( "-" )
mult = Literal( "*" )
div = Literal( "/" )
cent = Literal( "%" )
lpar = Literal( "(" ).suppress()
rpar = Literal( ")" ).suppress()
addop = plus | minus
multop = mult | div | cent
expop = Literal( "^" )
band = Literal( "#" )
# define expr as Forward, since we will reference it in atom
expr = Forward()
fn = Word( alphas )
atom = ( ( floatnumber | integer | ident | ( fn + lpar + expr + rpar ) ).setParseAction(pushFirst) |
( lpar + expr.suppress() + rpar ))
factor = Forward()
factor << atom + ( ( band + factor ).setParseAction( pushFirst ) | ZeroOrMore( ( expop + factor ).setParseAction( pushFirst ) ) )
term = factor + ZeroOrMore( ( multop + factor ).setParseAction( pushFirst ) )
expr << term + ZeroOrMore( ( addop + term ).setParseAction( pushFirst ) )
print(expr)
bnf = expr
pattern = bnf + StringEnd()
def test(s):
del exprstack[:]
bnf.parseString(s,parseAll=True)
print exprstack
test("avg(+10)")
test("v_1+8")
test("avg(v_1+10)+10")
Here is the what I want.
My functions are of this type:
foo(v_1+v_2) = foo(v_1) + foo(v_2)
The same behaviour is expected for any other binary operation as well. I have no idea how to make the parser do this automatically.
Break out the function call as a separate sub expression:
function_call = fn + lpar + expr + rpar
Then add a parse action to function_call that pops the operators and operands from expr_stack, then pushes them back onto the stack:
if an operand, push operand then function
if an operator, push the operator
Since you are only doing binary operations, you might be better off doing a simple approach first:
expr = Forward()
identifier = Word(alphas+'_', alphanums+'_')
expr = Forward()
function_call = Group(identifier + LPAR + Group(expr) + RPAR)
unop = oneOf("+ -")
binop = oneOf("+ - * / %")
operand = Group(Optional(unop) + (function_call | number | identifier))
binexpr = operand + binop + operand
expr << (binexpr | operand)
bnf = expr
This gives you a simpler structure to work with, without having to mess with exprstack.
def test(s):
exprtokens = bnf.parseString(s,parseAll=True)
print exprtokens
test("10")
test("10+20")
test("avg(10)")
test("avg(+10)")
test("column_1+8")
test("avg(column_1+10)+10")
Gives:
[['10']]
[['10'], '+', ['20']]
[[['avg', [['10']]]]]
[[['avg', [['+', '10']]]]]
[['column_1'], '+', ['8']]
[[['avg', [['column_1'], '+', ['10']]]], '+', ['10']]
You want to expand fn(a op b) to fn(a) op fn(b), but fn(a) should be left alone, so you need to test on the length of the parsed expression argument:
def distribute_function(tokens):
# unpack function name and arguments
fname, args = tokens[0]
# if args contains an expression, expand it
if len(args) > 1:
ret = ParseResults([])
for i,a in enumerate(args):
if i % 2 == 0:
# even args are operands to be wrapped in the function
ret += ParseResults([ParseResults([fname,ParseResults([a])])])
else:
# odd args are operators, just add them to the results
ret += ParseResults([a])
return ParseResults([ret])
function_call.setParseAction(distribute_function)
Now your calls to test will look like:
[['10']]
[['10'], '+', ['20']]
[[['avg', [['10']]]]]
[[['avg', [['+', '10']]]]]
[['column_1'], '+', ['8']]
[[[['avg', [['column_1']]], '+', ['avg', [['10']]]]], '+', ['10']]
This should even work recursively with a call like fna(fnb(3+2)+fnc(4+9)).

Categories