pyparsing nested structure not working as expected - python

I'm trying to parse a simple JSON-like structure into python dics and then turn it into a proper JSON structure. The block is as follows:
###################################################
# HEADER TEXT
# HEADER TEXT
###################################################
NAME => {
NAME => VALUE,
NAME => VALUE,
NAME => VALUE,
NAME => {
NAME => {
NAME => VALUE, NAME => VALUE, NAME => VALUE,
},
} # comment
}, # more comments
and repeating. Rules:
NAME = alphanums and _
VALUE = decimal(6) | hex (0xA) | list of hex ([0x1,0x2]) | text in brackets([A]) | string("A")
I set up the following grammar:
cfgName = Word(alphanums+"_")
cfgString = dblQuotedString().setParseAction(removeQuotes)
cfgNumber = Word("0123456789ABCDEFx")
LBRACK, RBRACK, LBRACE, RBRACE = map(Suppress, "[]{}")
EQUAL = Literal('=>').suppress()
cfgObject = Forward()
cfgValue = Forward()
cfgElements = delimitedList(cfgValue)
cfgArray = Group(LBRACK + Optional(cfgElements, []) + RBRACK)
cfgValue << (cfgString | cfgNumber | cfgArray | cfgName | Group(cfgObject))
memberDef = Group(cfgName + EQUAL + cfgValue)
cfgMembers = delimitedList(memberDef)
cfgObject << Dict(LBRACE + Optional(cfgMembers) + RBRACE)
cfgComment = pythonStyleComment
cfgObject.ignore(cfgComment)
EDIT: I've managed to isolate the problem. Proper JSON is
{member,member,member}
however my structure is:
{member,member,member,}
the last element in every nested structure is comma separated and I don't know how to account for that in the grammar.

Related

Cannot maintain focus on element in Selenium Python driver

I am trying to control the web by python to run a script and download the corresponding csv file.
Here is how the web page looks like with a dashboard menu to click the "Search" other button. Once clicked on Search button it shows a Search text box where one can enter a code and press enter to run.
Now I need to find the element of this Search box. From Inspect in Chrome, looks like below:
So I used the following code. I also used Actions to keep the focus on search box before I copy the code from a text file and send it to that search box.
def run_code():
""" Function to copy the code in Search and run it
"""
search_button=driver.find_element_by_link_text("Search")
search_button.click()
time.sleep(2)
with open('data_download_code.txt', 'r') as f:
code_file= f.read()
content_box=driver.find_element_by_class_name("ace_content")
# Getting the focus on the element
actions=ActionChains(driver)
actions.move_to_element(content_box)
actions.click()
content_box.send_keys(code_file,Keys.ENTER)
#content_box.submit()
However it throws an error of focus not on element.
I am not sure if I got the right element selector for Search from the attached html file, or it is just some focus issue. I did use Actions class there to get the focus.
I want the code to read the text in the txt file and send it to the search box and press enter to run it.
WebDriverException: Message: unknown error: cannot focus element
(Session info: chrome=71.0.3578.98)
EDIT: Extra html details for selector
Edit 2:
Edit 3:
So I am able to get the element for Search and it is able to copy the code from a txt file and enter in search box but I see it is not able to copy the whole code correctly hence gives an error. Pls see attached full code and how much got copied.
sourcetype=perf_log_bizx
(host=pc*bcf* OR host=pc*bsfapi* OR servername=pc*bcf* OR servername=pc*bsfapi*) OR
(host=sc*bcf* OR host=sc*bsfapi* OR servername=sc*bcf* OR servername=sc*bsfapi*) OR
(host=ac*bcf* OR host=ac*bsfapi* OR servername=ac*bcf* OR servername=ac*bsfapi*) OR
NOT "/perfLogServlet" NOT "REQ-\[*" earliest="12/18/2018:08:00:00" latest="12/18/2018:12:00:00"
| rex field=_raw "\[(?<client_ip>[\d\.]+)\]\s+\[(?<company_id>[^,]+),(?<company_name>[^,]+),(?<company_schema>[\w\.]+),(?<dbpool>[^,]+),(?<user_id>[^,]+),\S+\]\s+\S+\s+\S+\s+(?<render_time>\d+)\s(?<server_time>\d+)\s(?<end2end_time>\d+)\s+\S+\s\S+\s\[.*\]\s+\d+\-(?<call_id>\d+)\s+(?<module_id>(-|\w+))\s(?<page_name>(-|\w+))\s(?<page_qualifier>(-|\w+))"
| rex field=_raw "\[\[(?<MemoryAllocated>\d+)\w+\s+(?<CPUTimeTotal>\d+)\w+\s+(?<CPUTimeUser>\d+)\w+\s+(?<CPUTimeSystem>\d+)\w+\s+(?<FileRead>\d+)\w+\s+(?<FileWrite>\d+)\w+\s+(?<NetworkRead>\d+)\w+\s+(?<NetworkWrite>\d+)\w+\s+(?<NotClosedFiles>(\d+|-))\s+(?<NotClosedSockets>(\d+|-))\s+\]\]\s+(?<SQLInvokes>\d+)\s+(?<SQLTimeTotal>\d+)"
| eval company_id = ifnull(CMID, company_id)
| eval dbpool = ifnull(DPN, dbpool)
| eval company_schema =ifnull(SN, company_schema)
| eval user_id = ifnull(UID, user_id)
| eval module_id = ifnull(MID, module_id)
| eval page_name = ifnull(PID, page_name)
| eval page_qualifier = ifnull(PQ, page_qualifier)
| rex field=CAID "\d+\-(?<CAID>\d+)"
| eval call_id = ifnull(CAID, call_id)
| eval render_time = ifnull(RDT, render_time)
| eval server_time = ifnull(SVT, server_time)
| eval end2end_time = ifnull(EET, end2end_time)
| eval MemoryAllocated = ifnull(MEM, MemoryAllocated)
| eval CPUTimeTotal = ifnull(CPU, CPUTimeTotal)
| eval CPUTimeUser = ifnull(UCPU, CPUTimeUser)
| eval CPUTimeSystem = ifnull(SCPU, CPUTimeSystem)
| eval FileRead = ifnull(FRE, FileRead)
| eval FileWrite = ifnull(FWR, FileWrite)
| eval NetworkRead = ifnull(NRE, NetworkRead)
| eval NetworkWrite = ifnull(NWR, NetworkWrite)
| eval NotClosedFiles = ifnull(0, NotClosedFiles)
| eval NotClosedSockets = ifnull(0, NotClosedSockets)
| eval SQLInvokes = ifnull(SQLC, SQLInvokes)
| eval SQLTimeTotal = ifnull(SQLT, SQLTimeTotal)
| eval request_type = if(call_id=0,"Root", "Subaction")
| search call_id = 0 AND page_name!="NONE"
| eval full_page_name = module_id + "-" + page_name + "-" + page_qualifier + " [" + request_type + "]"
| eval has_open_sockets = if ( ifnull(NotClosedSockets,0) > 0, 1, 0)
| eval has_open_files = if ( ifnull(NotClosedFiles,0) > 0, 1, 0)
| eval time = strftime( _time, "%Y-%m-%d %H:%M:%S" )
| eval server = ifnull(servername, host)
| rex field=server "\w(?<dc>\d+)\w"
| eval dc_name = "DC" + tostring(dc)
| eval server_type = if (substr(server, 1, 2) = "pc", "PROD", if (substr(server, 1, 2) = "sc", "PREVIEW", if (substr(server, 1, 2) = "ac", "QA", "OTHER") ) )
| eval dc_company_user = dc + "|" + company_id + "|" + sha256( user_id )
| table
time,
dc_name,
server_type,
dbpool,
company_id,
full_page_name,
dc_company_user,
server_time,
end2end_time,
SQLInvokes,
SQLTimeTotal,
MemoryAllocated[![enter image description here][6]][6]
Edit4:
The code read from the txt file is also reading \n. So the string has \n in it and I guess that might be causing issues when sent to the WebDriver to run in the search box. Possible to read the code as it is in above edit?
'sourcetype=perf_log_bizx\n(host=pc*bcf* OR host=pc*bsfapi* OR servername=pc*bcf* OR servername=pc*bsfapi*) OR\n(host=sc*bcf* OR host=sc*bsfapi* OR servername=sc*bcf* OR servername=sc*bsfapi*) OR\n(host=ac*bcf* OR host=ac*bsfapi* OR servername=ac*bcf* OR servername=ac*bsfapi*) OR\nNOT "/perfLogServlet" NOT "REQ-\\[*" earliest="12/18/2018:08:00:00" latest="12/18/2018:12:00:00" \n \n | rex field=_raw "\\[(?<client_ip>[\\d\\.]+)\\]\\s+\\[(?<company_id>[^,]+),(?<company_name>[^,]+),(?<company_schema>[\\w\\.]+),(?<dbpool>[^,]+),(?<user_id>[^,]+),\\S+\\]\\s+\\S+\\s+\\S+\\s+(?<render_time>\\d+)\\s(?<server_time>\\d+)\\s(?<end2end_time>\\d+)\\s+\\S+\\s\\S+\\s\\[.*\\]\\s+\\d+\\-(?<call_id>\\d+)\\s+(?<module_id>(-|\\w+))\\s(?<page_name>(-|\\w+))\\s(?<page_qualifier>(-|\\w+))"\n | rex field=_raw "\\[\\[(?<MemoryAllocated>\\d+)\\w+\\s+(?<CPUTimeTotal>\\d+)\\w+\\s+(?<CPUTimeUser>\\d+)\\w+\\s+(?<CPUTimeSystem>\\d+)\\w+\\s+(?<FileRead>\\d+)\\w+\\s+(?<FileWrite>\\d+)\\w+\\s+(?<NetworkRead>\\d+)\\w+\\s+(?<NetworkWrite>\\d+)\\w+\\s+(?<NotClosedFiles>(\\d+|-))\\s+(?<NotClosedSockets>(\\d+|-))\\s+\\]\\]\\s+(?<SQLInvokes>\\d+)\\s+(?<SQLTimeTotal>\\d+)"\n \n | eval company_id = ifnull(CMID, company_id)\n | eval dbpool = ifnull(DPN, dbpool)\n | eval company_schema =ifnull(SN, company_schema)\n | eval user_id = ifnull(UID, user_id)\n \n | eval module_id = ifnull(MID, module_id)\n | eval page_name = ifnull(PID, page_name)\n | eval page_qualifier = ifnull(PQ, page_qualifier)\n \n | rex field=CAID "\\d+\\-(?<CAID>\\d+)"\n | eval call_id = ifnull(CAID, call_id)\n \n | eval render_time = ifnull(RDT, render_time)\n | eval server_time = ifnull(SVT, server_time)\n | eval end2end_time = ifnull(EET, end2end_time)\n | eval MemoryAllocated = ifnull(MEM, MemoryAllocated)\n | eval CPUTimeTotal = ifnull(CPU, CPUTimeTotal)\n | eval CPUTimeUser = ifnull(UCPU, CPUTimeUser)\n | eval CPUTimeSystem = ifnull(SCPU, CPUTimeSystem)\n | eval FileRead = ifnull(FRE, FileRead)\n | eval FileWrite = ifnull(FWR, FileWrite)\n | eval NetworkRead = ifnull(NRE, NetworkRead)\n | eval NetworkWrite = ifnull(NWR, NetworkWrite)\n | eval NotClosedFiles = ifnull(0, NotClosedFiles)\n | eval NotClosedSockets = ifnull(0, NotClosedSockets)\n | eval SQLInvokes = ifnull(SQLC, SQLInvokes)\n | eval SQLTimeTotal = ifnull(SQLT, SQLTimeTotal)\n \n | eval request_type = if(call_id=0,"Root", "Subaction")\n \n| search call_id = 0 AND page_name!="NONE"\n \n | eval full_page_name = module_id + "-" + page_name + "-" + page_qualifier + " [" + request_type + "]"\n | eval has_open_sockets = if ( ifnull(NotClosedSockets,0) > 0, 1, 0)\n | eval has_open_files = if ( ifnull(NotClosedFiles,0) > 0, 1, 0)\n | eval time = strftime( _time, "%Y-%m-%d %H:%M:%S" )\n | eval server = ifnull(servername, host)\n | rex field=server "\\w(?<dc>\\d+)\\w"\n | eval dc_name = "DC" + tostring(dc)\n | eval server_type = if (substr(server, 1, 2) = "pc", "PROD", if (substr(server, 1, 2) = "sc", "PREVIEW", if (substr(server, 1, 2) = "ac", "QA", "OTHER") ) )\n | eval dc_company_user = dc + "|" + company_id + "|" + sha256( user_id )\n \n| table\n time,\n dc_name,\n server_type,\n dbpool,\n company_id,\n full_page_name,\n dc_company_user,\n server_time,\n end2end_time,\n SQLInvokes,\n SQLTimeTotal,\n MemoryAllocated'
You should send keys to input field, but not to parent div. Try below instead:
content_box = driver.find_element_by_css_selector("div.ace_content input")
content_box.send_keys(code_file, Keys.ENTER)
or
content_box = driver.find_element_by_class_name('ace_text-input')
content_box.send_keys(code_file, Keys.ENTER)
Also note that most likely you won't need to use Actions
content_box=driver.find_element_by_class_name("ace_content")
this code will result in content_box being a "div" element. you can't send keys to a div element. inspect that div to find a "textarea" or "input" element, and set that to your content_box.
On top of #Andersson's answer (which you should accept btw, he did solve your problem ;) let me help you with stripping the \n from the source text. This code:
with open('data_download_code.txt', 'r') as f:
code_file= f.read()
, the read() method, returns the raw value of the file, with the EOL (end-of-line) characters intact. This though:
code_file = f.read.splitlines()
, will return it (in code_file) as a list of strings, each list member a line in the file. Now the question is - what to replace the EOL chars with? I'm not familiar with the language that's in it, so it's up to you to decide.
Say it is a semicolon, ;, this is how to transform the list back into a string:
code_file = ';'.join(code_file)
This will concatenate all list members in a single string, using that character as delimiter. Naturally, you just replace the char with whatever is applicable:
code_file = ' '.join(code_file) # a whitespace character
code_file = '\t'.join(code_file) # a tab
code_file = '\\n'.join(code_file) # a literal newline
code_file = 'whatever?'.join(code_file) # you name it
So the final form is:
with open('data_download_code.txt', 'r') as f:
code_file= f.readlines()
code_file = ';'.join(code_file)

Get correct brace grouping from string

I have files with incorrect JSON that I want to start fixing by getting it into properly grouped chunks.
The brace grouping {{ {} {} } } {{}} {{{}}} should already be correct
How can I grab all the top-level braces, correctly grouped, as separate strings?
If you don't want to install any extra modules simple function will do:
def top_level(s):
depth = 0
start = -1
for i, c in enumerate(s):
if c == '{':
if depth == 0:
start = i
depth += 1
elif c == '}' and depth:
depth -= 1
if depth == 0:
yield s[start:i+1]
print(list(top_level('{{ {} {} } } {{}} {{{}}}')))
Output:
['{{ {} {} } }', '{{}}', '{{{}}}']
It will skip invalid braces but could be easily modified to report an error when they are spotted.
Using the regex module:
In [1]: import regex
In [2]: braces = regex.compile(r"\{(?:[^{}]++|(?R))*\}")
In [3]: braces.findall("{{ {} {} } } {{}} {{{}}}")
Out[3]: ['{{ {} {} } }', '{{}}', '{{{}}}']
pyparsing can be really helpful here. It will handle pathological cases where you have braces inside strings, etc. It might be a little tricky to do all of this work yourself, but fortunately, somebody (the author of the library) has already done the hard stuff for us.... I'll reproduce the code here to prevent link-rot:
# jsonParser.py
#
# Implementation of a simple JSON parser, returning a hierarchical
# ParseResults object support both list- and dict-style data access.
#
# Copyright 2006, by Paul McGuire
#
# Updated 8 Jan 2007 - fixed dict grouping bug, and made elements and
# members optional in array and object collections
#
json_bnf = """
object
{ members }
{}
members
string : value
members , string : value
array
[ elements ]
[]
elements
value
elements , value
value
string
number
object
array
true
false
null
"""
from pyparsing import *
TRUE = Keyword("true").setParseAction( replaceWith(True) )
FALSE = Keyword("false").setParseAction( replaceWith(False) )
NULL = Keyword("null").setParseAction( replaceWith(None) )
jsonString = dblQuotedString.setParseAction( removeQuotes )
jsonNumber = Combine( Optional('-') + ( '0' | Word('123456789',nums) ) +
Optional( '.' + Word(nums) ) +
Optional( Word('eE',exact=1) + Word(nums+'+-',nums) ) )
jsonObject = Forward()
jsonValue = Forward()
jsonElements = delimitedList( jsonValue )
jsonArray = Group(Suppress('[') + Optional(jsonElements) + Suppress(']') )
jsonValue << ( jsonString | jsonNumber | Group(jsonObject) | jsonArray | TRUE | FALSE | NULL )
memberDef = Group( jsonString + Suppress(':') + jsonValue )
jsonMembers = delimitedList( memberDef )
jsonObject << Dict( Suppress('{') + Optional(jsonMembers) + Suppress('}') )
jsonComment = cppStyleComment
jsonObject.ignore( jsonComment )
def convertNumbers(s,l,toks):
n = toks[0]
try:
return int(n)
except ValueError, ve:
return float(n)
jsonNumber.setParseAction( convertNumbers )
Phew! That's a lot ... Now how do we use it? The general strategy here will be to scan the string for matches and then slice those matches out of the original string. Each scan result is a tuple of the form (lex-tokens, start_index, stop_index). For our use, we don't care about the lex-tokens, just the start and stop. We could do: string[result[1], result[2]] and it would work. We can also do string[slice(*result[1:])] -- Take your pick.
results = jsonObject.scanString(testdata)
for result in results:
print '*' * 80
print testdata[slice(*result[1:])]

generated code is not indent

I am modifying the oil file using python script. I have written EBNF grammar to convert oil file to AST using Grako. And generate oil file back from AST using codegen but the Oil file is not indent (generate in one line).
Sample Oil file:
CPU dummy
{
OS StdOS
{
SCALABILITYCLASS = SC1;
STATUS = EXTENDED;
};
};
Generated Oil:
CPUdummy{OSStdOS{SCALABILITYCLASS=SC1;STATUS=EXTENDED;};};
EBNF grammer:
file = [{Comments_option}] OIL_version Includes [implementation_definition] application_definition {object_definition_list};
Includes
= "#include" include_name ;
include_name
= ?/[!-_A-Za-z0-9]+/? ;
OIL_version
= "OIL_VERSION" "=" version description ";" ;
version = '"' ver '"';
implementation_definition
= "IMPLEMENTATION" name "{" implementation_spec_list "}" description ";";
implementation_spec_list
= [implementation_spec] ;
implementation_spec
= object "{" implementation_def "}" description ";";
object = "OS"
"TASK"
"COUNTER"
"ALARM"
"RESOURCE"
"EVENT"
"ISR"
"MESSAGE"
"COM"
"NM"
"APPMODE"
"IPDU"
"APPLICATION";
implementation_list
= [implementation_def]
| [implementation_list implementation_def] ;
implementation_def
= impl_attr_def
| impl_ref_def;
impl_attr_def
= "UINT32" auto_specifier number_range attribute_name multiple_specifier default_number description ";"
| ( "INT32" | "UINT64" | "INT64" ) auto_specifier number_range attribute_name multiple_specifier default_number description ";"
| "FLOAT" auto_specifier float_range attribute_name multiple_specifier default_float description ";"
| "ENUM" auto_specifier enumeration attribute_name multiple_specifier default_name description ";"
| "STRING" auto_specifier attribute_name multiple_specifier default_string description ";"
| "BOOLEAN" auto_specifier bool_values attribute_name multiple_specifier default_bool description ";" ;
impl_parameter_list
= [( "{" {implementation_def} [implementation_def] "}" )] ;
auto_specifier
= ["WITH_AUTO"];
number_range
= [( "[" ( number ".." | ( number ) ) number "]" )];
number_list
= number
| number_list "," number ;
default_number
= [( "=" ( number | "NO_DEFAULT" | "AUTO" ) )];
description
= [( ":" '"' comments '"' )] ;
float_range
= [( "[" float ".." float "]" )] ;
default_float
= [( "=" ( float | "NO_DEFAULT" | "AUTO" ) )] ;
enumeration
= "[" enumerator_list "]";
enumerator_list
= enumerator
| enumerator_list "," enumerator ;
enumerator
= name [impl_parameter_list] description;
bool_values
= [( "[" "TRUE" impl_parameter_list description "," "FALSE" impl_parameter_list description "]" )] ;
default_name
= [( "=" ( name | "NO_DEFAULT" | "AUTO" ) )] ;
default_string
= [( "=" ( string | "NO_DEFAULT" | "AUTO" ) )] ;
default_bool
= [( "=" ( boolean | "NO_DEFAULT" | "AUTO" ) )] ;
impl_ref_def
= object_ref_type reference_name multiple_specifier description ";";
object_ref_type
= "OS_TYPE"
| "TASK_TYPE"
| "COUNTER_TYPE"
| "ALARM_TYPE"
| "RESOURCE_TYPE"
| "EVENT_TYPE"
| "ISR_TYPE"
| "MESSAGE_TYPE"
| "COM_TYPE"
| "NM_TYPE"
| "APPMODE_TYPE"
| "IPDU_TYPE";
reference_name
= name
| object;
multiple_specifier
= [( "[" "]" )] ;
application_definition
= "CPU" name "{" [Includes] { ( parameter_list Comments_option ) } "}" description ";" ;
object_definition_list
= [object_definition];
Comment_list
= object_definition | parameter comments ;
object_definition
= object_name "{" { parameter_list Comments_option } "}" description ";" ;
object_name
= object name;
parameter_list
= [parameter];
parameter
= attribute_name "=" attribute_value [ "{" { ( parameter [Comments_option] ) } "}" ] description ";" ;
attribute_name
= name
| object;
attribute_value
= boolean
| float
| number
| string
| "AUTO"
| '"' string '"';
Comments_option
= ( Single_line Multi_line );
Single_line = {"//" comments};
Multi_line = {"/*#*" Comment_list "*#*/"};
name = ?/[-_A-Za-z0-9]+/?;
string = ?/[-_A-Za-z0-9_*, ]+/?;
ver = ?/[0-9.0-9]+/?;
comments = ?/[-_A-Za-z0-9 *#]+/? ;
boolean = "FALSE"
| "TRUE";
number = dec_number
| hex_number;
dec_number
= sign int_digits;
sign = [( "+" | "-" )] ;
int_digits
= zero_digit
| pos_digit
| pos_digit dec_digits ;
dec_digits
= {dec_digit} [dec_digit] ;
float = ver;
exponent = [( ( "e" | "E" ) sign dec_digits )] ;
zero_digit
= "0";
pos_digit
= "1"
| "2"
| "3"
| "4"
| "5"
| "6"
| "7"
| "8"
| "9";
dec_digit
= zero_digit
| pos_digit;
hex_number
= "0x" {hex_digit};
hex_digit
= "A"
| "B"
| "C"
| "D"
| "E"
| "F"
| "a"
| "b"
| "c"
| "d"
| "e"
| "f"
| "0"
| "1"
| "2"
| "3"
| "4"
| "5"
| "6"
| "7"
| "8"
| "9";
For indentation grako to be taken care or codegen. How to indent the generated code. Thanks.
import json
from grako.util import asjson
print(json.dumps(asjson(myast), indent=4))

Parse C-like declarations using pyparsing

I would like to parse declarations using pyparsing in a C-like source (GLSL code) such that I get a list of (type, name, value).
For example:
int a[3];
int b=1, c=2.0;
float d = f(z[2], 2) + 3*g(4,a), e;
Point f = {1,2};
I would like to obtain something like:
[ ('int', 'a[3]', ''),
('int', 'b', '1'),
('int', 'c', '2.0'),
('float', 'd', 'f(z[2], 2) + 3*g(4,a)'),
('float', 'e', ''),
('Point', 'f', '{1,2}') ]
I've played with Forward() and operatorPrecedence() to try to parse the rhs expression but I suspect it is not necessary in my case.
So far I have:
IDENTIFIER = Regex('[a-zA-Z_][a-zA-Z_0-9]*')
INTEGER = Regex('([+-]?(([1-9][0-9]*)|0+))')
EQUAL = Literal("=").suppress()
SEMI = Literal(";").suppress()
SIZE = INTEGER | IDENTIFIER
VARNAME = IDENTIFIER
TYPENAME = IDENTIFIER
VARIABLE = Group(VARNAME.setResultsName("name")
+ Optional(EQUAL + Regex("[^,;]*").setResultsName("value")))
VARIABLES = delimitedList(VARIABLE.setResultsName("variable",listAllMatches=True))
DECLARATION = (TYPENAME.setResultsName("type")
+ VARIABLES.setResultsName("variables", listAllMatches=True) + SEMI)
code = """
float a=1, b=3+f(2), c;
float d=1.0, e;
float f = z(3,4);
"""
for (token, start, end) in DECLARATION.scanString(code):
for variable in token.variable:
print token.type, variable.name, variable.value
but the last expression (f=z(3,4)) is not parsed because of the ,.
There is a C struct parser on the pyparsing wiki that might give you a good start.
This seems to work.
IDENTIFIER = Word(alphas+"_", alphas+nums+"_" )
INT_DECIMAL = Regex('([+-]?(([1-9][0-9]*)|0+))')
INT_OCTAL = Regex('(0[0-7]*)')
INT_HEXADECIMAL = Regex('(0[xX][0-9a-fA-F]*)')
INTEGER = INT_HEXADECIMAL | INT_OCTAL | INT_DECIMAL
FLOAT = Regex('[+-]?(((\d+\.\d*)|(\d*\.\d+))([eE][-+]?\d+)?)|(\d*[eE][+-]?\d+)')
LPAREN, RPAREN = Literal("(").suppress(), Literal(")").suppress()
LBRACK, RBRACK = Literal("[").suppress(), Literal("]").suppress()
LBRACE, RBRACE = Literal("{").suppress(), Literal("}").suppress()
SEMICOLON, COMMA = Literal(";").suppress(), Literal(",").suppress()
EQUAL = Literal("=").suppress()
SIZE = INTEGER | IDENTIFIER
VARNAME = IDENTIFIER
TYPENAME = IDENTIFIER
OPERATOR = oneOf("+ - * / [ ] . & ^ ! { }")
PART = nestedExpr() | nestedExpr('{','}') | IDENTIFIER | INTEGER | FLOAT | OPERATOR
EXPR = delimitedList(PART, delim=Empty()).setParseAction(keepOriginalText)
VARIABLE = (VARNAME("name") + Optional(LBRACK + SIZE + RBRACK)("size")
+ Optional(EQUAL + EXPR)("value"))
VARIABLES = delimitedList(VARIABLE.setResultsName("variables",listAllMatches=True))
DECLARATION = (TYPENAME("type") + VARIABLES + SEMICOLON)
code = """
int a[3];
int b=1, c=2.0;
float d = f(z[2], 2) + 3*g(4,a), e;
Point f = {1,2};
"""
for (token, start, end) in DECLARATION.scanString(code):
vtype = token.type
for variable in token.variables:
name = variable.name
size = variable.size
value = variable.value
s = "%s / %s" % (vtype,name)
if size: s += ' [%s]' % size[0]
if value: s += ' / %s' % value[0]
s += ";"
print s

Cannot parse correctly this file with pyparsing

I am trying to parse a file using the amazing python library pyparsing but I am having a lot of problems...
The file I am trying to parse is something like:
sectionOne:
list:
- XXitem
- XXanotherItem
key1: value1
product: milk
release: now
subSection:
skey : sval
slist:
- XXitem
mods:
- XXone
- XXtwo
version: last
sectionTwo:
base: base-0.1
config: config-7.0-7
As you can see is an indented configuration file, and this is more or less how I have tried to define the grammar
The file can have one or more sections
Each section is formed by a section name and a section content.
Each section have an indented content
Each section content can have one or more pairs of key/value or a subsection.
Each value can be just a single word or a list of items.
A list of items is a group of one or more items.
Each item is an HYPHEN + a name starting with 'XX'
I have tried to create this grammar using pyparsing but with no success.
import pprint
import pyparsing
NEWLINE = pyparsing.LineEnd().suppress()
VALID_CHARACTERS = pyparsing.srange("[a-zA-Z0-9_\-\.]")
COLON = pyparsing.Suppress(pyparsing.Literal(":"))
HYPHEN = pyparsing.Suppress(pyparsing.Literal("-"))
XX = pyparsing.Literal("XX")
list_item = HYPHEN + pyparsing.Combine(XX + pyparsing.Word(VALID_CHARACTERS))
list_of_items = pyparsing.Group(pyparsing.OneOrMore(list_item))
key = pyparsing.Word(VALID_CHARACTERS) + COLON
pair_value = pyparsing.Word(VALID_CHARACTERS) + NEWLINE
value = (pair_value | list_of_items)
pair = pyparsing.Group(key + value)
indentStack = [1]
section = pyparsing.Forward()
section_name = pyparsing.Word(VALID_CHARACTERS) + COLON
section_value = pyparsing.OneOrMore(pair | section)
section_content = pyparsing.indentedBlock(section_value, indentStack, True)
section << pyparsing.Group(section_name + section_content)
parser = pyparsing.OneOrMore(section)
def main():
try:
with open('simple.info', 'r') as content_file:
content = content_file.read()
print "content:\n", content
print "\n"
result = parser.parseString(content)
print "result1:\n", result
print "len", len(result)
pprint.pprint(result.asList())
except pyparsing.ParseException, err:
print err.line
print " " * (err.column - 1) + "^"
print err
except pyparsing.ParseFatalException, err:
print err.line
print " " * (err.column - 1) + "^"
print err
if __name__ == '__main__':
main()
This is the result :
result1:
[['sectionOne', [[['list', ['XXitem', 'XXanotherItem']], ['key1', 'value1'], ['product', 'milk'], ['release', 'now'], ['subSection', [[['skey', 'sval'], ['slist', ['XXitem']], ['mods', ['XXone', 'XXtwo']], ['version', 'last']]]]]]], ['sectionTwo', [[['base', 'base-0.1'], ['config', 'config-7.0-7']]]]]
len 2
[
['sectionOne',
[[
['list', ['XXitem', 'XXanotherItem']],
['key1', 'value1'],
['product', 'milk'],
['release', 'now'],
['subSection',
[[
['skey', 'sval'],
['slist', ['XXitem']],
['mods', ['XXone', 'XXtwo']],
['version', 'last']
]]
]
]]
],
['sectionTwo',
[[
['base', 'base-0.1'],
['config', 'config-7.0-7']
]]
]
]
As you can see I have two main problems:
1.- Each section content is nested twice into a list
2.- the key "version" is parsed inside the "subSection" when it belongs to the "sectionOne"
My real target is to be able to get a structure of python nested dictionaries with the keys and values to easily extract the info for each field, but the pyparsing.Dict is something obscure to me.
Could anyone please help me ?
Thanks in advance
( sorry for the long post )
You really are pretty close - congrats, indented parsers are not the easiest to write with pyparsing.
Look at the commented changes. Those marked with 'A' are changes to fix your two stated problems. Those marked with 'B' add Dict constructs so that you can access the parsed data as a nested structure using the names in the config.
The biggest culprit is that indentedBlock does some extra Group'ing for you, which gets in the way of Dict's name-value associations. Using ungroup to peel that away lets Dict see the underlying pairs.
Best of luck with pyparsing!
import pprint
import pyparsing
NEWLINE = pyparsing.LineEnd().suppress()
VALID_CHARACTERS = pyparsing.srange("[a-zA-Z0-9_\-\.]")
COLON = pyparsing.Suppress(pyparsing.Literal(":"))
HYPHEN = pyparsing.Suppress(pyparsing.Literal("-"))
XX = pyparsing.Literal("XX")
list_item = HYPHEN + pyparsing.Combine(XX + pyparsing.Word(VALID_CHARACTERS))
list_of_items = pyparsing.Group(pyparsing.OneOrMore(list_item))
key = pyparsing.Word(VALID_CHARACTERS) + COLON
pair_value = pyparsing.Word(VALID_CHARACTERS) + NEWLINE
value = (pair_value | list_of_items)
#~ A: pair = pyparsing.Group(key + value)
pair = (key + value)
indentStack = [1]
section = pyparsing.Forward()
section_name = pyparsing.Word(VALID_CHARACTERS) + COLON
#~ A: section_value = pyparsing.OneOrMore(pair | section)
section_value = (pair | section)
#~ B: section_content = pyparsing.indentedBlock(section_value, indentStack, True)
section_content = pyparsing.Dict(pyparsing.ungroup(pyparsing.indentedBlock(section_value, indentStack, True)))
#~ A: section << Group(section_name + section_content)
section << (section_name + section_content)
#~ B: parser = pyparsing.OneOrMore(section)
parser = pyparsing.Dict(pyparsing.OneOrMore(pyparsing.Group(section)))
Now instead of pprint(result.asList()) you can write:
print (result.dump())
to show the Dict hierarchy:
[['sectionOne', ['list', ['XXitem', 'XXanotherItem']], ... etc. ...
- sectionOne: [['list', ['XXitem', 'XXanotherItem']], ... etc. ...
- key1: value1
- list: ['XXitem', 'XXanotherItem']
- mods: ['XXone', 'XXtwo']
- product: milk
- release: now
- subSection: [['skey', 'sval'], ['slist', ['XXitem']]]
- skey: sval
- slist: ['XXitem']
- version: last
- sectionTwo: [['base', 'base-0.1'], ['config', 'config-7.0-7']]
- base: base-0.1
- config: config-7.0-7
allowing you to write statements like:
print (result.sectionTwo.base)

Categories