pyparsing: how to parse a nested function call that starts with a particular function name? - python

I want to use pyparsing to parse a nested function call that starts with a particular function name.
Just like this:
tag("tag_name_1", value_equal("proxy.province", "value", "return_value", test(1,2)))
The string to be parsed starts with the function named 'tag'.
The problem is: why doesn't exprStack contain the "tag" function?
import pyparsing as pp
from typing import Any, List, Dict

def debug(*args, **kwargs):
    print("debug" + "---" * 10)
    print(*args, **kwargs)
    print("debug" + "---" * 10)
    # return "debug"
    return "debug"

def insert_fn_argcount_tuple(t):
    fn = t.pop(0)
    num_args = len(t)
    print((fn, num_args))
    t.insert(0, (fn, num_args))

def push_first(toks):
    exprStack.append(toks[0])

def to_string(toks):
    pass

LPAREN, RPAREN, COMMA = map(pp.Suppress, '(),')
ident = pp.Word(pp.alphas, pp.alphanums + "_")
integer = pp.Word(pp.nums)
string = (pp.QuotedString("'") | pp.QuotedString('"')).setParseAction()
expr = pp.Forward()
expr_list = pp.delimitedList(pp.Group(expr))
tag_fn = ("tag" + LPAREN + expr_list + RPAREN).setParseAction(insert_fn_argcount_tuple)
fn_call = (ident + LPAREN + expr_list + RPAREN).setParseAction(insert_fn_argcount_tuple)
atom = ((fn_call | string | integer) | pp.Group(LPAREN + expr + RPAREN)).addParseAction(push_first)
# atom = (fn_call | pp.Group(LPAREN + expr + RPAREN)).addParseAction(push_first)
expr <<= atom
bnf = pp.Forward()
bnf <<= tag_fn
funcs = """tag
value_equal
value_contain
value_match
value
"""
# functions
def tag(tag_name: str, value: Any) -> Dict:
    if not tag_name or not value:
        return {}
    return {"tag_name": tag_name, "tag_value": value}

def test(*args, **kwargs):
    return ""

def value_equal(key: str, value, default=None, test=None):
    print(f"---{value_equal}---")
    print(f"key: {key}, value: {value}, default: {default}, test: {test}")
    return "value-1"

fns = {
    "tag": tag,
    "value_equal": value_equal,
    "test": test
}
exprStack = []

def evaluate_stack(s: List):  # note: the list passed in is consumed (mutated) during evaluation
    fn, arg_nums = s.pop(), 0
    if isinstance(fn, tuple):
        fn, arg_nums = fn
    if fn in fns:
        args = reversed([evaluate_stack(s) for _ in range(arg_nums)])
        return fns[fn](*args)
    else:
        return fn
test_str = """tag("tag_name_1", value_equal("proxy.province", "value", "return_value", test(1,2)))"""
# test_str = "123"
p = bnf.parse_string(test_str)
print(f"\nexprStack:{exprStack}\n")
t = evaluate_stack(exprStack)
print(f"tag:{t}")
The output of the above code is:
('test', 2)
('value_equal', 4)
('tag', 2)
exprStack:['tag_name_1', 'proxy.province', 'value', 'return_value', '1', '2', ('test', 2), ('value_equal', 4)]
I expect exprStack to contain the tag function, maybe like this:
exprStack:['tag_name_1', 'proxy.province', 'value', 'return_value', '1', '2', ('test', 2), ('value_equal', 4), ('tag', 2)]

You are really pretty close. The thing is, the push_first parse action is attached to atoms, but tag_fn is not an atom, so it never gets its data pushed to exprStack.
To fix this:
Change atom to include tag_fn, something like this:
atom = ((tag_fn | fn_call | string | integer) | pp.Group(LPAREN+expr+RPAREN)).addParseAction(push_first)
Change bnf to expr instead of tag_fn:
bnf <<= expr
With these two changes, I get this for exprStack:
exprStack:['tag_name_1', 'proxy.province', 'value', 'return_value', '1', '2', ('test', 2), ('value_equal', 4), ('tag', 2)]
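Putting both changes together, here is a minimal end-to-end sketch (same names as above, trimmed to the essentials) that leaves ('tag', 2) on top of exprStack:
import pyparsing as pp

exprStack = []

def insert_fn_argcount_tuple(t):
    fn = t.pop(0)
    t.insert(0, (fn, len(t)))

def push_first(toks):
    exprStack.append(toks[0])

LPAREN, RPAREN = map(pp.Suppress, "()")
ident = pp.Word(pp.alphas, pp.alphanums + "_")
integer = pp.Word(pp.nums)
string = pp.QuotedString("'") | pp.QuotedString('"')

expr = pp.Forward()
expr_list = pp.delimitedList(pp.Group(expr))
tag_fn = ("tag" + LPAREN + expr_list + RPAREN).setParseAction(insert_fn_argcount_tuple)
fn_call = (ident + LPAREN + expr_list + RPAREN).setParseAction(insert_fn_argcount_tuple)
# tag_fn is now an atom alternative, so push_first also fires for it
atom = ((tag_fn | fn_call | string | integer) | pp.Group(LPAREN + expr + RPAREN)).addParseAction(push_first)
expr <<= atom

bnf = pp.Forward()
bnf <<= expr

bnf.parse_string('tag("tag_name_1", value_equal("proxy.province", "value", "return_value", test(1,2)))')
print(exprStack)  # [..., ('test', 2), ('value_equal', 4), ('tag', 2)]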

Related

How to remove values from file if N/A in beam

Just looking for advice: if I have a file like the one below, what function do I use in Apache Beam to remove any N/A value? I tried Filter, but it removes the whole row, and I only want to clear that 'cell' if it's N/A. I first read the file in and then split the rows using a Split DoFn called with ParDo; now I want to remove any N/A values.
Example file
Start_loc, Loc_2, loc_3, loc_4, end_loc
Loc 1, loc 2, N/A, loc 3, loc 4
Loc 1, N/A, N/A, N/A, loc 2
Any suggestions?
I had tried the code below before seeing these answers. It works; I'm just aware it might not be the 'right' way of doing it. Any feedback on it?
class Split(beam.DoFn):
    def process(self, element):
        """
        Splits each row on commas and returns a dictionary representing the
        row
        """
        Start_loc, loc_1, loc_2, loc_3, loc_4, End_loc = element.split(",")
        return [{
            'Start_loc': Start_loc,
            'Loc_1': loc_1,
            'Loc_2': loc_2,
            'Loc_3': loc_3,
            'Loc_4': loc_4,
            'End_loc': End_loc
        }]

class CleanFile(beam.DoFn):
    def process(self, element):
        for k, v in list(element.items()):
            if v == 'N/A':
                element[k] = None
        return [{
            'Start_loc': element['Start_loc'],
            'loc_1': element['Loc_1'],
            'loc_2': element['Loc_2'],
            'loc_3': element['Loc_3'],
            'loc_4': element['Loc_4'],
            'End_loc': element['End_loc']
        }]

class CombineColumns(beam.DoFn):
    def process(self, element):
        """
        Prepares each row to be written in the csv
        """
        other_loc = ''
        for k, v in list(element.items()):
            if v is not None and k != 'Start_loc' and k != 'End_loc' and other_loc == '':
                other_loc = '"' + v
            elif v is not None and k != 'Start_loc' and k != 'End_loc':
                other_loc = other_loc + ',' + v
        other_loc = other_loc + '"'
        return [{
            'Start_loc': element['Start_loc'],
            'Other_loc': other_loc,
            'End_loc': element['End_loc']
        }]

class WriteToCSV(beam.DoFn):
    def process(self, element):
        """
        Prepares each row to be written in the csv
        """
        result = [
            "{},{},{}".format(
                element['Start_loc'],
                element['Other_loc'],
                element['End_loc']
            )
        ]
        return result

def process_file():
    pipeline_options = PipelineOptions()
    user_options = pipeline_options.view_as(UserOptions)
    tstmp = datetime.now().strftime("%Y%m%d%H")
    input = user_options.input
    output = user_options.output
    with beam.Pipeline(options=pipeline_options) as p:
        ROWS = p | 'Read from a File' >> beam.io.ReadFromText(input, skip_header_lines=1) | beam.ParDo(Split())
        CLEAN = ROWS | beam.ParDo(CleanFile())
        FORMAT = CLEAN | beam.ParDo(CombineColumns())
        FORMAT | beam.ParDo(WriteToCSV()) | beam.io.WriteToText(output, file_name_suffix=".csv", header='Start_loc,Other_loc,End_loc')
I tried it out using the Create method to build a PCollection and then applied a ParDo on it to get the desired result. I have assumed that you want to replace the N/A string with an empty string.
The DoFn reads the string, splits it on the "," delimiter, and replaces "N/A" with "" before adding each piece to a list. After that it returns the list values joined by the delimiter.
import apache_beam as beam

class FilterFn(beam.DoFn):
    def __init__(self, delimiter, filter_item):
        self.delimiter = delimiter
        self.filter_item = filter_item

    def process(self, text):
        a_list = []
        for word in text.split(self.delimiter):
            if word.strip() == self.filter_item:  # replace the condition and output to fit your requirement
                a_list.append("")
            else:
                a_list.append(word)
        # print(",".join(a_list))
        yield self.delimiter.join(a_list)

with beam.Pipeline() as pipeline:
    plants = (
        pipeline
        | 'Create Dummy Input' >> beam.Create(["Loc 1,loc 2,N/A,loc 3,loc 4"])
        | 'Split words and Remove N/A' >> beam.ParDo(FilterFn(',', 'N/A'))
        | beam.Map(print)  # do further processing
    )
The output String that I'm getting after this ParDo is
Loc 1,loc 2,,loc 3,loc 4
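If you want to keep the file-based pipeline from the question, a rough sketch (untested, using the same UserOptions names as the original code) would be to run FilterFn on the raw lines right after ReadFromText and before the Split DoFn:
def process_file():
    pipeline_options = PipelineOptions()
    user_options = pipeline_options.view_as(UserOptions)
    with beam.Pipeline(options=pipeline_options) as p:
        rows = (
            p
            | 'Read from a File' >> beam.io.ReadFromText(user_options.input, skip_header_lines=1)
            | 'Blank out N/A cells' >> beam.ParDo(FilterFn(',', 'N/A'))
            | 'Split rows' >> beam.ParDo(Split())
        )
        # ...then CombineColumns / WriteToCSV / WriteToText as before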

DRF dynamic filtering

I need dynamic filtering in DRF that allows parentheses for defining operator precedence and can use any combination of the available fields in the model.
Operations are: and, or, eq (equal), ne (not equal), gt (greater than), lt (less than)
example: "(date eq '2016-05-01') AND ((number_of_calories gt 20) OR (number_of_calories lt 10))"
How can I achieve this? What is the best way?
Currently I have the solution below, but it's not a good approach as it's vulnerable to SQL injection:
utils.py
mappings = {
    ' eq ': ' = ',
    ' ne ': ' != ',
    ' gt ': ' > ',
    ' lt ': ' < ',
    ' gte ': ' >= ',
    ' lte ': ' <= ',
}

def convert_string(query: str) -> Optional[str]:
    if query and isinstance(query, str):
        pattern_drop = re.compile(r"drop\s+table\s*\w*")
        pattern_alter = re.compile(r"alter\s+table\s+\w+")
        pattern_delete = re.compile(r"delete\s+from\s+\w+")
        pattern_update = re.compile(r"update\s+\w+\s+set\s+\w+")
        pattern_insert = re.compile(r"insert\s+into\s+\w+")
        pattern_select = re.compile(r"select\s+\w+\s+from\s+")
        query_lower = query.lower()
        if '--' in query_lower or '/*' in query_lower or \
                pattern_drop.match(query_lower) or pattern_alter.match(query_lower) or \
                pattern_update.match(query_lower) or pattern_insert.match(query_lower) or \
                pattern_delete.match(query_lower) or pattern_select.match(query_lower):
            return None
        for expression, operation in mappings.items():
            query = query.replace(expression, operation)
        return query
views.py
def get_queryset(self):
    q_string = self.request.data['query']
    # q_string = "(date eq '2016-05-01') AND ((number_of_calories gt 20) OR (number_of_calories lt 10))"
    query = convert_string(q_string)
    # just replace 'eq' with '=', 'ne' with '!=', and so on ...
    # query = "(date = '2016-05-01') AND ((number_of_calories > 20) OR (number_of_calories < 10))"
    users = Users.objects.raw('SELECT * FROM Users WHERE ' + query)
    return users
For parsing a query string like:
string = "((num_of_pages gt 20) OR (num_of_pages lt 10)) AND (date gt '2016-05-01')"
you can use the pyparsing package (I'm not an expert, but it's a very powerful library) together with Django Q objects:
parsing code:
import pyparsing as pp
import operator as op
from django.db.models import Q
word = pp.Word(pp.alphas, pp.alphanums + "_-*'")
operator = pp.oneOf('lt gt eq').setResultsName('operator')
number = pp.pyparsing_common.number()
quoted = pp.quotedString().setParseAction(pp.removeQuotes)
term = (word | number | quoted)
key = term.setResultsName('key')
value = term.setResultsName('value')
group = pp.Group(key + operator + value)
def q_item(item):
    """Helper for creating a django Q() object"""
    k = f'{item.key}__{item.operator}'
    v = item.value
    return Q(**{k: v})

class BaseBinary:
    def __init__(self, tokens):
        self.args = tokens[0][0::2]

    def __repr__(self):
        return f'{self.__class__.__name__}({self.symbol}):{self.args}'

    def evaluate(self):
        a = q_item(self.args[0]) if not isinstance(self.args[0], BaseBinary) else self.args[0].evaluate()
        b = q_item(self.args[1]) if not isinstance(self.args[1], BaseBinary) else self.args[1].evaluate()
        return self.op(a, b)

class BoolNotOp(BaseBinary):
    symbol = 'NOT'
    op = op.not_

    def __init__(self, tokens):
        super().__init__(tokens)
        self.args = tokens[0][1]

    def evaluate(self):
        a = q_item(self.args) if not isinstance(self.args, BaseBinary) else self.args.evaluate()
        return ~a

class BoolAndOp(BaseBinary):
    symbol = 'AND'
    op = op.and_

class BoolOrOp(BaseBinary):
    symbol = 'OR'
    op = op.or_

expr = pp.infixNotation(group,
                        [('NOT', 1, pp.opAssoc.RIGHT, BoolNotOp),
                         ('AND', 2, pp.opAssoc.LEFT, BoolAndOp),
                         ('OR', 2, pp.opAssoc.LEFT, BoolOrOp)])
Now given a string like:
string = "(date gt '2016-05-01') AND ((num_of_pages gt 20) OR (num_of_pages lt 10))"
passing it to the parser:
parser = expr.parseString(string)[0]
print(parser.evaluate())
gives us our Q objects:
(AND: ('date__gt', '2016-05-01'), (OR: ('num_of_pages__gt', 20), ('num_of_pages__lt', 10)))
ready to be used in a filter:
class Book(models.Model):
    title = models.CharField(max_length=200)
    counter = models.PositiveIntegerField(default=0)
    date = models.DateField(auto_now=True)
    num_of_pages = models.PositiveIntegerField(default=0)
qs = Book.objects.filter(parser.evaluate())
print(qs.query)
SELECT "core_book"."id", "core_book"."title", "core_book"."counter", "core_book"."date", "core_book"."num_of_pages" FROM "core_book" WHERE ("core_book"."date" > 2016-05-01 AND ("core_book"."num_of_pages" > 20 OR "core_book"."num_of_pages" < 10))
P.S. Not fully tested.
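If it helps, here is a minimal, untested sketch of wiring this parser into a DRF view (the query-param name and view class are placeholders, not part of the answer above):
from rest_framework import viewsets

class BookViewSet(viewsets.ModelViewSet):
    queryset = Book.objects.all()
    serializer_class = BookSerializer  # assumed to exist elsewhere

    def get_queryset(self):
        q_string = self.request.query_params.get('query')
        if not q_string:
            return self.queryset
        node = expr.parseString(q_string)[0]
        # a bare comparison parses to a plain group, so fall back to q_item
        q = node.evaluate() if isinstance(node, BaseBinary) else q_item(node)
        return self.queryset.filter(q)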
I currently use the Q object extensively in a project of mine that uses the user's GET parameters to filter search results.
Here is a snippet
import datetime
from django.db.models import Q

some_initial_query_object = Model.objects.all()

qs_result_dates = []
qs_result_dates.append(
    Q(
        event_date__start_date_time__gte='2021-08-01',
        event_date__start_date_time__lt=datetime.date(2021, 9, 1) + datetime.timedelta(days=1)
    )
)
some_initial_query_object = some_initial_query_object.filter(*qs_result_dates)
In your scenario you can use | for OR and & for AND
Q(date='2016-05-01')
&
Q(number_of_calories__gt=20, number_of_calories__lt=10)
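For the exact expression in the question, that combination could be sketched like this (note the inner OR between the two calorie conditions; Meal is a placeholder model name):
from django.db.models import Q

q = Q(date='2016-05-01') & (Q(number_of_calories__gt=20) | Q(number_of_calories__lt=10))
meals = Meal.objects.filter(q)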
Here is an example of dynamic filtering in DRF by overriding the get_queryset method on the ModelViewSet, which I use in all of my projects. Using this method I can leverage the full power of the Django ORM from the frontend.
views.py
class BaseAPIView(...):
    ''' base view for other views to inherit '''

    def get_queryset(self):
        queryset = self.queryset
        # get filter request from client:
        filter_string = self.request.query_params.get('filter')
        # apply filters if they are passed in:
        if filter_string:
            filter_dictionary = json.loads(filter_string)
            queryset = queryset.filter(**filter_dictionary)
        return queryset
The request url will now look like, for example: my_website.com/api/users?filter={"first_name":"John"}
Which can be built like:
script.js
// using ajax as an example:
var filter = JSON.stringify({
    "first_name" : "John"
});
$.ajax({
    "url" : "my_website.com/api/users?filter=" + filter,
    "type" : "GET",
    ...
});
Some advantages:
no need to specify which fields can be filtered on each view class
write it once, use it everywhere
front end filtering looks exactly like django filtering
can do the same with exclude
Some disadvantages:
potential security risks if you want some fields to be non-filterable
less intuitive front-end code to query a table
Overall, this approach has been far more useful for me than any packages out there.
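One hedged way to soften the first disadvantage (my own sketch, not part of the answer above) is to whitelist the filterable fields per view before handing the dictionary to filter():
import json

class FilterWhitelistMixin:
    # fields clients may filter on; override per view (placeholder names)
    allowed_filter_fields = {'first_name', 'last_name'}

    def get_queryset(self):
        queryset = self.queryset
        filter_string = self.request.query_params.get('filter')
        if filter_string:
            filter_dictionary = json.loads(filter_string)
            # keep only keys whose base field name is whitelisted
            filter_dictionary = {
                k: v for k, v in filter_dictionary.items()
                if k.split('__')[0] in self.allowed_filter_fields
            }
            queryset = queryset.filter(**filter_dictionary)
        return queryset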

Get correct brace grouping from string

I have files with incorrect JSON that I want to start fixing by getting it into properly grouped chunks.
The brace grouping {{ {} {} } } {{}} {{{}}} should already be correct
How can I grab all the top-level braces, correctly grouped, as separate strings?
If you don't want to install any extra modules, a simple function will do:
def top_level(s):
    depth = 0
    start = -1
    for i, c in enumerate(s):
        if c == '{':
            if depth == 0:
                start = i
            depth += 1
        elif c == '}' and depth:
            depth -= 1
            if depth == 0:
                yield s[start:i+1]

print(list(top_level('{{ {} {} } } {{}} {{{}}}')))
Output:
['{{ {} {} } }', '{{}}', '{{{}}}']
It will skip invalid braces but could be easily modified to report an error when they are spotted.
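For example, a strict variant (my own sketch, not part of the original answer) that raises on unbalanced braces could look like:
def top_level_strict(s):
    depth = 0
    start = -1
    for i, c in enumerate(s):
        if c == '{':
            if depth == 0:
                start = i
            depth += 1
        elif c == '}':
            if depth == 0:
                raise ValueError("unmatched '}' at index %d" % i)
            depth -= 1
            if depth == 0:
                yield s[start:i + 1]
    if depth:
        raise ValueError("unclosed '{' at end of input")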
Using the regex module:
In [1]: import regex
In [2]: braces = regex.compile(r"\{(?:[^{}]++|(?R))*\}")
In [3]: braces.findall("{{ {} {} } } {{}} {{{}}}")
Out[3]: ['{{ {} {} } }', '{{}}', '{{{}}}']
pyparsing can be really helpful here. It will handle pathological cases where you have braces inside strings, etc. It might be a little tricky to do all of this work yourself, but fortunately, somebody (the author of the library) has already done the hard stuff for us.... I'll reproduce the code here to prevent link-rot:
# jsonParser.py
#
# Implementation of a simple JSON parser, returning a hierarchical
# ParseResults object support both list- and dict-style data access.
#
# Copyright 2006, by Paul McGuire
#
# Updated 8 Jan 2007 - fixed dict grouping bug, and made elements and
# members optional in array and object collections
#
json_bnf = """
object
{ members }
{}
members
string : value
members , string : value
array
[ elements ]
[]
elements
value
elements , value
value
string
number
object
array
true
false
null
"""
from pyparsing import *
TRUE = Keyword("true").setParseAction( replaceWith(True) )
FALSE = Keyword("false").setParseAction( replaceWith(False) )
NULL = Keyword("null").setParseAction( replaceWith(None) )
jsonString = dblQuotedString.setParseAction( removeQuotes )
jsonNumber = Combine( Optional('-') + ( '0' | Word('123456789',nums) ) +
                      Optional( '.' + Word(nums) ) +
                      Optional( Word('eE',exact=1) + Word(nums+'+-',nums) ) )
jsonObject = Forward()
jsonValue = Forward()
jsonElements = delimitedList( jsonValue )
jsonArray = Group(Suppress('[') + Optional(jsonElements) + Suppress(']') )
jsonValue << ( jsonString | jsonNumber | Group(jsonObject) | jsonArray | TRUE | FALSE | NULL )
memberDef = Group( jsonString + Suppress(':') + jsonValue )
jsonMembers = delimitedList( memberDef )
jsonObject << Dict( Suppress('{') + Optional(jsonMembers) + Suppress('}') )
jsonComment = cppStyleComment
jsonObject.ignore( jsonComment )
def convertNumbers(s, l, toks):
    n = toks[0]
    try:
        return int(n)
    except ValueError:
        return float(n)

jsonNumber.setParseAction( convertNumbers )
Phew! That's a lot ... Now how do we use it? The general strategy here will be to scan the string for matches and then slice those matches out of the original string. Each scan result is a tuple of the form (lex-tokens, start_index, stop_index). For our use, we don't care about the lex-tokens, just the start and stop. We could do string[result[1]:result[2]] and it would work. We can also do string[slice(*result[1:])] -- take your pick.
results = jsonObject.scanString(testdata)
for result in results:
    print('*' * 80)
    print(testdata[slice(*result[1:])])

Syntax error of Python PLY parser

I'm writing a simplified MODULA-2 grammar using Python PLY.
But I'm getting a syntax error:
$ python3 m2.py
Syntax error at 'MODULE'
and I cannot figure out what the problem with the rules is.
Here is the grammar:
import ply.lex as lex
import ply.yacc as yacc
# =============================================================================
# Lexer rules
# =============================================================================
tokens = (
    # Keywords
    'RETURN', 'IF', 'THEN', 'VAR', 'MODULE', 'BEGIN', 'END',
    # Constants
    'NUMBER',
    # Operators
    'PLUS', 'MINUS', 'TIMES', 'DIV', 'MOD', 'ASSIGN_OP',
    # Separators
    'LPAR', 'RPAR', 'PERIOD', 'COLON', 'SEMICOLON',
    # Identifier
    'IDENT',
)
# Tokens
t_NUMBER = r'\d+'
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_LPAR = r'\('
t_RPAR = r'\)'
t_PERIOD = r'\.'
t_COLON = r':'
t_SEMICOLON = r';'
t_ASSIGN_OP = r':='
t_IDENT = r'[a-zA-Z][a-zA-Z0-9]*'
# Ignored characters
t_ignore = ' \t'
def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)
# Build the lexer
lexer = lex.lex()
# =============================================================================
# Parser rules
# =============================================================================
precedence = (
    ('left', 'PLUS', 'MINUS'),
    ('left', 'TIMES', 'DIV'),
)
def p_add_operator(t):
    """ add_operator : PLUS
                     | MINUS
    """
    pass

def p_mul_operator(t):
    """ mul_operator : TIMES
                     | DIV
                     | MOD
    """
    pass

def p_simple_expression(t):
    """ expression : term
                   | expression add_operator term
    """
    pass

def p_term(t):
    """ term : factor
             | term mul_operator factor
    """
    pass

def p_factor(t):
    """ factor : NUMBER
               | IDENT
               | LPAR expression RPAR
    """
    pass

def p_statement(t):
    """ statement : IDENT
                  | IDENT ASSIGN_OP expression
                  | IF expression THEN statement_sequence END
                  | RETURN expression
    """
    pass

def p_statement_sequence(t):
    """ statement_sequence : statement
                           | statement_sequence SEMICOLON statement
    """
    pass

def p_block(t):
    """ block : declaration_list BEGIN statement_sequence END
    """
    pass

def p_declaration_list(t):
    """ declaration_list : declaration
                         | declaration_list declaration
    """
    pass

def p_declaration(t):
    """ declaration : VAR IDENT COLON IDENT SEMICOLON
    """
    pass

def p_program_module(t):
    """ program_module : MODULE IDENT SEMICOLON block IDENT PERIOD
    """
    pass

def p_error(t):
    print("Syntax error at '%s'" % t.value)
parser = yacc.yacc(start='program_module')
if __name__ == "__main__":
    s = "MODULE test; VAR x: INTEGER; BEGIN x := 10 END test."
    parser.parse(s)
The interesting thing is that the same grammar rules written for lex/yacc are working fine. Can somebody help me with this?
AFAIK, ply.lex doesn't have enough magic to know that you want the special word MODULE to become the MODULE token.
With your definition, the simple test:
lexer.input("MODULE test; VAR x: INTEGER; BEGIN x := 10 END test.")
for tok in lexer:
    print(tok)
outputs:
LexToken(IDENT,'MODULE',1,0)
LexToken(IDENT,'test',1,7)
LexToken(SEMICOLON,';',1,11)
LexToken(IDENT,'VAR',1,13)
LexToken(IDENT,'x',1,17)
LexToken(COLON,':',1,18)
LexToken(IDENT,'INTEGER',1,20)
LexToken(SEMICOLON,';',1,27)
LexToken(IDENT,'BEGIN',1,29)
LexToken(IDENT,'x',1,35)
LexToken(ASSIGN_OP,':=',1,37)
LexToken(NUMBER,'10',1,40)
LexToken(IDENT,'END',1,43)
LexToken(IDENT,'test',1,47)
LexToken(PERIOD,'.',1,51)
The correct way to process keywords is to identify them inside the IDENT token:
# =============================================================================
# Lexer rules
# =============================================================================

# Keywords
keywords = ( 'RETURN', 'IF', 'THEN', 'VAR', 'MODULE', 'BEGIN', 'END' )

tokens = keywords + (
    # Constants
    'NUMBER',
    ...
and
def t_IDENT(t):
    r'[a-zA-Z][a-zA-Z0-9]*'
    if t.value in keywords:  # is this a keyword
        t.type = t.value
    return t
The same lexer test now correctly gives:
LexToken(MODULE,'MODULE',1,0)
LexToken(IDENT,'test',1,7)
LexToken(SEMICOLON,';',1,11)
LexToken(VAR,'VAR',1,13)
LexToken(IDENT,'x',1,17)
LexToken(COLON,':',1,18)
LexToken(IDENT,'INTEGER',1,20)
LexToken(SEMICOLON,';',1,27)
LexToken(BEGIN,'BEGIN',1,29)
LexToken(IDENT,'x',1,35)
LexToken(ASSIGN_OP,':=',1,37)
LexToken(NUMBER,'10',1,40)
LexToken(END,'END',1,43)
LexToken(IDENT,'test',1,47)
LexToken(PERIOD,'.',1,51)
and the parsing shows no error.
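As a side note, the PLY documentation suggests the same thing via a reserved-words dictionary; with this grammar (where keyword spellings and token names coincide) that idiom reduces to roughly:
reserved = {kw: kw for kw in keywords}  # e.g. {'MODULE': 'MODULE', 'BEGIN': 'BEGIN', ...}

def t_IDENT(t):
    r'[a-zA-Z][a-zA-Z0-9]*'
    t.type = reserved.get(t.value, 'IDENT')  # fall back to IDENT for non-keywords
    return t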

Parse C-like declarations using pyparsing

I would like to parse declarations using pyparsing in a C-like source (GLSL code) such that I get a list of (type, name, value).
For example:
int a[3];
int b=1, c=2.0;
float d = f(z[2], 2) + 3*g(4,a), e;
Point f = {1,2};
I would like to obtain something like:
[ ('int', 'a[3]', ''),
('int', 'b', '1'),
('int', 'c', '2.0'),
('float', 'd', 'f(z[2], 2) + 3*g(4,a)'),
('float', 'e', ''),
('Point', 'f', '{1,2}') ]
I've played with Forward() and operatorPrecedence() to try to parse the rhs expression but I suspect it is not necessary in my case.
So far I have:
IDENTIFIER = Regex('[a-zA-Z_][a-zA-Z_0-9]*')
INTEGER = Regex('([+-]?(([1-9][0-9]*)|0+))')
EQUAL = Literal("=").suppress()
SEMI = Literal(";").suppress()
SIZE = INTEGER | IDENTIFIER
VARNAME = IDENTIFIER
TYPENAME = IDENTIFIER
VARIABLE = Group(VARNAME.setResultsName("name")
+ Optional(EQUAL + Regex("[^,;]*").setResultsName("value")))
VARIABLES = delimitedList(VARIABLE.setResultsName("variable",listAllMatches=True))
DECLARATION = (TYPENAME.setResultsName("type")
+ VARIABLES.setResultsName("variables", listAllMatches=True) + SEMI)
code = """
float a=1, b=3+f(2), c;
float d=1.0, e;
float f = z(3,4);
"""
for (token, start, end) in DECLARATION.scanString(code):
    for variable in token.variable:
        print token.type, variable.name, variable.value
but the last expression (f = z(3,4)) is not parsed because of the comma.
There is a C struct parser on the pyparsing wiki that might give you a good start.
This seems to work.
IDENTIFIER = Word(alphas+"_", alphas+nums+"_" )
INT_DECIMAL = Regex('([+-]?(([1-9][0-9]*)|0+))')
INT_OCTAL = Regex('(0[0-7]*)')
INT_HEXADECIMAL = Regex('(0[xX][0-9a-fA-F]*)')
INTEGER = INT_HEXADECIMAL | INT_OCTAL | INT_DECIMAL
FLOAT = Regex('[+-]?(((\d+\.\d*)|(\d*\.\d+))([eE][-+]?\d+)?)|(\d*[eE][+-]?\d+)')
LPAREN, RPAREN = Literal("(").suppress(), Literal(")").suppress()
LBRACK, RBRACK = Literal("[").suppress(), Literal("]").suppress()
LBRACE, RBRACE = Literal("{").suppress(), Literal("}").suppress()
SEMICOLON, COMMA = Literal(";").suppress(), Literal(",").suppress()
EQUAL = Literal("=").suppress()
SIZE = INTEGER | IDENTIFIER
VARNAME = IDENTIFIER
TYPENAME = IDENTIFIER
OPERATOR = oneOf("+ - * / [ ] . & ^ ! { }")
PART = nestedExpr() | nestedExpr('{','}') | IDENTIFIER | INTEGER | FLOAT | OPERATOR
EXPR = delimitedList(PART, delim=Empty()).setParseAction(keepOriginalText)
VARIABLE = (VARNAME("name") + Optional(LBRACK + SIZE + RBRACK)("size")
+ Optional(EQUAL + EXPR)("value"))
VARIABLES = delimitedList(VARIABLE.setResultsName("variables",listAllMatches=True))
DECLARATION = (TYPENAME("type") + VARIABLES + SEMICOLON)
code = """
int a[3];
int b=1, c=2.0;
float d = f(z[2], 2) + 3*g(4,a), e;
Point f = {1,2};
"""
for (token, start, end) in DECLARATION.scanString(code):
    vtype = token.type
    for variable in token.variables:
        name = variable.name
        size = variable.size
        value = variable.value
        s = "%s / %s" % (vtype, name)
        if size: s += ' [%s]' % size[0]
        if value: s += ' / %s' % value[0]
        s += ";"
        print s
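Note that this answer targets an old pyparsing release (keepOriginalText, Python 2 prints). Current pyparsing no longer ships keepOriginalText; the originalTextFor helper plays a similar role, so the EXPR line would likely become something like:
EXPR = originalTextFor(delimitedList(PART, delim=Empty()))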
