Recursive function for nested dictionaries - python

I have a text file in ASN.1 format, and I'm writing my own parser for it. In the example below, the parser first creates a dictionary for Order and then walks its items to check whether each value is itself a dictionary. The type names that define dictionaries in the file have already been identified and kept in seq_list. Now I need to write a recursive function that descends into all the dictionaries and builds nested dictionaries.
import re

ee='\
Module-order DEFINITIONS AUTOMATIC TAGS ::=\
BEGIN\
Order ::= SEQUENCE {\
header Order-header\
}\
Order-header ::= SEQUENCE {\
reference NumericString (SIZE (12)),\
date NumericString (SIZE (8)) -- MMDDYYYY --\
}END'
seq_list=['Order','Order-header']
condition='Order ::= SEQUENCE {\
header Order-header\
}'

def rec_fn(ee, condition):
    k = []
    ee = ee.lower()
    ee = ee.replace('\n', '')
    for i in condition.splitlines():
        # Removes empty items
        i = i.split(' ')
        k.append(list(filter(None, i)))
    for index_content, content in enumerate(k):
        for index, value in enumerate(content[1:]):
            new_value = value.replace(',', '')
            if new_value in seq_list:
                # will have the contents of all the items of the new
                # dictionary found.
                reg_value = re.findall(r'{0}\s*::=\s*sequence(.*?)(::=|end)'.format(new_value), ee)
sample.asn:
ee=''' Module-order DEFINITIONS AUTOMATIC TAGS ::=
BEGIN
Order ::= SEQUENCE {
header Order-header,
items SEQUENCE OF Order-line }
Order-header ::= SEQUENCE {
reference NumericString (SIZE (12)),
date NumericString (SIZE (8)) -- MMDDYYYY --,
client Client,
payment Payment-method }
Client ::= SEQUENCE {
name PrintableString (SIZE (1..20)),
street PrintableString (SIZE (1..50)) OPTIONAL,
postcode NumericString (SIZE (5)),
town PrintableString (SIZE (1..30)),
country PrintableString (SIZE (1..20)) DEFAULT "France" }
Payment-method ::= CHOICE {
check NumericString (SIZE (15)),
credit-card Credit-card,
cash NULL }
Credit-card ::= SEQUENCE {
type Card-type,
number NumericString (SIZE (20)),
expiry-date NumericString (SIZE (6)) -- MMYYYY -- }
Card-type ::= ENUMERATED {cb(0), visa(1), eurocard(2), diners(3), american-express(4)}END'''

You can use the following recursive function:
import re

def rec_fn(asn, key):
    def build_definitions(mapping, key, sequence_only=False):
        # CHOICE definitions are stored under a one-element tuple key so they
        # can be told apart from SEQUENCE definitions of the same name.
        if not sequence_only and (key,) in mapping:
            key = (key,)
            is_choice = True
        else:
            is_choice = False
        if isinstance(mapping[key], dict):
            definitions = {}
            for variable, definition in mapping[key].items():
                if definition in mapping or (definition,) in mapping:
                    definitions[variable] = build_definitions(mapping, definition, sequence_only=is_choice)
                else:
                    definitions[variable] = definition
            return definitions
        else:
            return mapping[key]

    # First pass: collect every top-level definition into a flat mapping.
    mapping = {}
    for name, type, definition in re.findall(r'([A-Za-z-]+)\s*::=\s*(SEQUENCE|CHOICE|ENUMERATED)\s*{(.*?)}(?=\s*(?:[A-Za-z-]+\s*::=\s*(?:SEQUENCE|CHOICE|ENUMERATED)|END)\b)', asn, flags=re.DOTALL):
        if type in ('SEQUENCE', 'CHOICE'):
            for definitions in re.sub(r'{[^}]*}', '', definition).split(','):
                definitions = re.sub(r'\bSET OF\b|\(.*\).*', '', definitions).strip().split(maxsplit=1)
                if definitions:
                    mapping.setdefault(name if type == 'SEQUENCE' else (name,), {})[definitions[0]] = definitions[1]
        elif type == 'ENUMERATED':
            mapping[name] = re.findall(r'[A-Za-z-]+', definition)
    # Second pass: recursively expand type references starting from the given key.
    return build_definitions(mapping, key)
so that with the following (note that it's better to use triple quotes for a multi-line string literal):
ee='''
Module-order DEFINITIONS AUTOMATIC TAGS ::=
BEGIN
Order ::= SEQUENCE {
header Order-header
}
Order-header ::= SEQUENCE {
reference NumericString (SIZE (12)),
date NumericString (SIZE (8)) -- MMDDYYYY --
}END'''
rec_fn(ee, 'Order') will return:
{'header': {'reference': 'NumericString', 'date': 'NumericString'}}
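And as a quick way to see deeper nesting, a sketch assuming ee instead holds the fuller sample.asn module from the question:

from pprint import pprint

# With the fuller module, the same call also expands Client, Payment-method
# (a CHOICE, returned as a nested dict) and Card-type (an ENUMERATED,
# returned as a list of its labels).
pprint(rec_fn(ee, 'Order'))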

Related

python - extract text from microsoft word

I'm trying to extract text from specific parts of an MS Word document (link) - sample below. Essentially I need to write all text between the tags -- ASN1START and -- ASN1STOP to a file, excluding the tags themselves.
sample text
-- ASN1START
CounterCheck ::= SEQUENCE {
    rrc-TransactionIdentifier RRC-TransactionIdentifier,
    criticalExtensions CHOICE {
        c1 CHOICE {
            counterCheck-r8 CounterCheck-r8-IEs,
            spare3 NULL, spare2 NULL, spare1 NULL
        },
        criticalExtensionsFuture SEQUENCE {}
    }
}
CounterCheck-r8-IEs ::= SEQUENCE {
    drb-CountMSB-InfoList DRB-CountMSB-InfoList,
    nonCriticalExtension CounterCheck-v8a0-IEs OPTIONAL
}
CounterCheck-v8a0-IEs ::= SEQUENCE {
    lateNonCriticalExtension OCTET STRING OPTIONAL,
    nonCriticalExtension CounterCheck-v1530-IEs OPTIONAL
}
CounterCheck-v1530-IEs ::= SEQUENCE {
    drb-CountMSB-InfoListExt-r15 DRB-CountMSB-InfoListExt-r15 OPTIONAL, -- Need ON
    nonCriticalExtension SEQUENCE {} OPTIONAL
}
DRB-CountMSB-InfoList ::= SEQUENCE (SIZE (1..maxDRB)) OF DRB-CountMSB-Info
DRB-CountMSB-InfoListExt-r15 ::= SEQUENCE (SIZE (1..maxDRBExt-r15)) OF DRB-CountMSB-Info
DRB-CountMSB-Info ::= SEQUENCE {
    drb-Identity DRB-Identity,
    countMSB-Uplink INTEGER(0..33554431),
    countMSB-Downlink INTEGER(0..33554431)
}
-- ASN1STOP
I have tried using docx.
from docx import *
import re
import json

fileName = './data/36331-f80.docx'
document = Document(fileName)
startText = re.compile(r'-- ASN1START')

for para in document.paragraphs:
    # look at each paragraph
    text = para.text
    print(text)
    # if startText.match(para.text):
    #     print(text)
It seems every line here with the tags mentioned above is a paragraph. I need help with extracting just the text within the tags.
You may try first reading all document/paragraph text into a single string, and then using re.findall to find all matching text in between the target tags:
text = ""
for para in document.paragraphs:
text += para.text + "\n"
matches = re.findall(r'-- ASN1START\s*(.*?)\s*-- ASN1STOP', text, flags=re.DOTALL)
Note that we use DOTALL mode with the regex so that .*? can match content between the tags even when it spans newlines.
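Since the goal is to write the extracted text to a file, a minimal follow-up sketch (the output filename is just a placeholder):

# Join the extracted blocks and write them out, tags excluded
# ("extracted_asn1.txt" is a hypothetical name).
with open('extracted_asn1.txt', 'w') as out:
    out.write('\n\n'.join(matches))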

Parsing a custom format (curly braces separated) text configuration with Pyparsing

I need to parse some load balancer configuration section. It's seemingly simple (at least for a human).
Config consists of several objects with their content in curly braces like so:
ltm rule ssl-header-insert {
    when HTTP_REQUEST {
        HTTP::header insert "X-SSL-Connection" "yes"
    }
}
ltm rule some_redirect {
    priority 1
    when HTTP_REQUEST {
        if { (not [class match [IP::remote_addr] equals addresses_group ]) }
        {
            HTTP::redirect "http://some.page.example.com"
            TCP::close
            event disable all
        }
    }
The contents of each section/object is TCL code, so there will be nested curly braces. What I want to achieve is to parse this into pairs: the object identifier (after the ltm rule keywords) and its contents (the TCL code within the braces), kept exactly as it is.
I've looked around some examples and experimented a lot, but it's really giving me a hard time. I did some debugging within pyparsing (which is a bit confusing to me too) and I think that I'm failing to detect closing braces somehow, but can't figure that out.
What I came up with so far:
from pyparsing import *
import json

list_sample = """ltm rule ssl-header-insert {
    when HTTP_REQUEST {
        HTTP::header insert "X-SSL-Connection" "yes"
    }
}
ltm rule some_redirect {
    priority 1
    when HTTP_REQUEST {
        if { (not [class match [IP::remote_addr] equals addresses_group ]) }
        {
            HTTP::redirect "http://some.page.example.com"
            TCP::close
            event disable all
        }
    }
}
ltm rule http_header_replace {
    when HTTP_REQUEST {
        HTTP::header replace Host some.host.example.com
    }
}"""

ParserElement.setDefaultWhitespaceChars(" \t")
NL = LineEnd()
END = StringEnd()
LBRACE, RBRACE = map(Suppress, '{}')
ANY_HEADER = Suppress("ltm rule ") + Word(alphas, alphanums + "_-")
END_MARK = Literal("ltm rule")
CONTENT_LINE = (~ANY_HEADER + (NotAny(RBRACE + FollowedBy(END_MARK)) + ~END + restOfLine) | (~ANY_HEADER + NotAny(RBRACE + FollowedBy(END)) + ~END + restOfLine)) | (~RBRACE + ~END + restOfLine)

ANY_HEADER.setName("HEADER").setDebug()
LBRACE.setName("LBRACE").setDebug()
RBRACE.setName("RBRACE").setDebug()
CONTENT_LINE.setName("LINE").setDebug()

template_defn = ZeroOrMore((ANY_HEADER + LBRACE +
                            Group(ZeroOrMore(CONTENT_LINE)) +
                            RBRACE))
template_defn.ignore(NL)

results = template_defn.parseString(list_sample).asList()
print("Raw print:")
print(results)
print("----------------------------------------------")
print("JSON pretty dump:")
print(json.dumps(results, indent=2))
I see in the debug that some of the matches work but in the end it fails with an empty list as a result.
On a sidenote - my CONTENT_LINE part of the grammar is probably overly complicated in general, but I didn't find any simpler way to cover it so far.
The next thing would be to figure out how to preserve new lines and tabs in the content part, since I need that to be unchanged in the output. But it looks like I have to use the ignore() function - which skips new lines - to parse the multiline text in the first place, so that's another challenge.
I'd be grateful for someone to help me find out what the issues are. Or maybe I should take some other approach?
I think nestedExpr('{', '}') will help. That will take care of the nested '{}'s, and wrapping in originalTextFor will preserve newlines and spaces.
import pyparsing as pp

LTM, RULE = map(pp.Keyword, "ltm rule".split())
ident = pp.Word(pp.alphas, pp.alphanums + '-_')
ltm_rule_expr = pp.Group(LTM + RULE
                         + ident('name')
                         + pp.originalTextFor(pp.nestedExpr('{', '}'))('body'))
Using your sample string (after adding the missing trailing '}'):
for rule, _, _ in ltm_rule_expr.scanString(list_sample):
    print(rule[0].name, rule[0].body.splitlines()[0:2])
gives
ssl-header-insert ['{', ' when HTTP_REQUEST {']
some_redirect ['{', ' priority 1']
dump() is also a good way to list out the contents of a returned ParseResults:
for rule, _, _ in ltm_rule_expr.scanString(list_sample):
    print(rule[0].dump())
    print()
['ltm', 'rule', 'ssl-header-insert', '{\n when HTTP_REQUEST {\n HTTP::header insert "X-SSL-Connection" "yes"\n}\n}']
- body: '{\n when HTTP_REQUEST {\n HTTP::header insert "X-SSL-Connection" "yes"\n}\n}'
- name: 'ssl-header-insert'
['ltm', 'rule', 'some_redirect', '{\n priority 1\n\nwhen HTTP_REQUEST {\n\n if { (not [class match [IP::remote_addr] equals addresses_group ]) }\n {\n HTTP::redirect "http://some.page.example.com"\n TCP::close\n event disable all\n }\n}}']
- body: '{\n priority 1\n\nwhen HTTP_REQUEST {\n\n if { (not [class match [IP::remote_addr] equals addresses_group ]) }\n {\n HTTP::redirect "http://some.page.example.com"\n TCP::close\n event disable all\n }\n}}'
- name: 'some_redirect'
Note that I broke up 'ltm' and 'rule' into separate keyword expressions. This guards against the case where a developer may have written valid code as ltm  rule blah, with more than one space between "ltm" and "rule". This kind of thing happens all the time; you never know where whitespace will crop up.
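Since the original goal included a JSON dump of identifier/content pairs, a short sketch along those lines, assuming the grammar above and your list_sample:

import json

# Map each rule name to its verbatim body; newlines and tabs inside the
# body are preserved by originalTextFor, so they survive the JSON dump.
rules = {rule[0].name: rule[0].body
         for rule, _, _ in ltm_rule_expr.scanString(list_sample)}
print(json.dumps(rules, indent=2))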

How to print out the ! in pyaml?

I have code to print out a dict as YAML, like so:
import yaml

yaml.dump(
    {
        "Properties":
        {
            "ImageId": "!Ref AParameter"
        }
    },
    new_template,  # an already-open output file object
    default_flow_style=False
)
This creates:
Properties:
  ImageId: '!Ref AParameter'
Notice how the value for ImageId is inside quotes? I would like to print without the quotes. How do I do that with PyYAML?
The ! has a special meaning, as it is used to introduce an explicit tag, and therefore cannot appear at the beginning of a plain (unquoted) style scalar. Specifically rule 126 of the YAML 1.2 specification indicates that the first character of such a plain scalar cannot be a c-indicator, which is what ! is.
Such a scalar has to be quoted (single or double), which PyYAML does automatically, or be put in a literal or folded block style.
You could dump valid YAML without quotes by using a literal block style scalar:
Properties:
  ImageId: |
    !Ref AParameter
Without supportive programming PyYAML cannot do this. You can use ruamel.yaml to do so (disclaimer: I am the author of that package) by making the value a PreservedScalarString instance: ruamel.yaml.scalarstring.PreservedScalarString("!Ref AParameter")
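For example, a minimal sketch of that approach, assuming ruamel.yaml's legacy API (the trailing newline lets the scalar dump cleanly as a literal block):

from ruamel import yaml
from ruamel.yaml.scalarstring import PreservedScalarString

data = {"Properties": {"ImageId": PreservedScalarString("!Ref AParameter\n")}}
# round_trip_dump represents PreservedScalarString as a literal block
# scalar (|), so the ! survives without quotes.
print(yaml.round_trip_dump(data))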
You can of course define a class that dumps using the !Ref tag, but the tag context will force quotes around the scalar AParameter:
import sys
import yaml

class Ref(str):
    @staticmethod
    def yaml_dumper(dumper, data):
        return dumper.represent_scalar('!Ref', u'{}'.format(data), style=None)

yaml.add_representer(Ref, Ref.yaml_dumper)

yaml.dump(
    {
        "Properties":
        {
            "ImageId": Ref("AParameter"),
        }
    },
    sys.stdout,
    default_flow_style=False,
)
which gives:
Properties:
  ImageId: !Ref 'AParameter'
This is so even though loading !Ref AParameter with an appropriate constructor is possible (i.e. the quotes are just added here to be on the safe side).
If you also want to suppress those quotes, you can e.g. do so using ruamel.yaml, by defining a special style 'x' for your node and handling that style in the emitter:
from ruamel import yaml

class Ref(str):
    @staticmethod
    def yaml_dumper(dumper, data):
        return dumper.represent_scalar('!Ref', u'{}'.format(data), style='x')

    @staticmethod
    def yaml_constructor(loader, node):
        value = loader.construct_scalar(node)
        return Ref(value)

yaml.add_representer(Ref, Ref.yaml_dumper)
yaml.add_constructor('!Ref', Ref.yaml_constructor,
                     constructor=yaml.constructor.SafeConstructor)

def choose_scalar_style(self):
    # Emit scalars tagged with the fake style 'x' as plain (unquoted) scalars.
    if self.event.style == 'x':
        return ''
    return self.org_choose_scalar_style()

yaml.emitter.Emitter.org_choose_scalar_style = yaml.emitter.Emitter.choose_scalar_style
yaml.emitter.Emitter.choose_scalar_style = choose_scalar_style

data = {
    "Properties":
    {
        "ImageId": Ref("AParameter"),
    }
}

ys = yaml.dump(data, default_flow_style=False)
print(ys)
data_out = yaml.safe_load(ys)
assert data_out == data
the above doesn't throw an error on the assert, so the data round-trips and the printed output is AFAICT exactly what you want:
Properties:
  ImageId: !Ref AParameter

How to use datastore GAE in Go when initially it was created in Python?

I have a datastore kind "Items" which was created in Python. In this Go code (it's version 2), the loop over q.Run() does not iterate the data:
type Items struct {
    code string
    date time.Time
    name string
}

func getcode(w http.ResponseWriter, r *http.Request) {
    code := mux.Vars(r)["code"]
    fmt.Fprintf(w, "get code %v", code)
    c := appengine.NewContext(r)
    q := datastore.NewQuery("Items")
    for t := q.Run(c); ; {
        var x Items
        key, err := t.Next(&x)
        fmt.Fprintf(w, "%v", key)
        if err == datastore.Done {
            break
        }
        if err != nil {
            //serveError(c, w, err)
            return
        }
        fmt.Fprintf(w, "Code=%v\n", x.code)
    }
}
The datastore package uses reflection to fill struct fields when reading an entity from the datastore. In Go, struct fields whose names start with a lowercase letter are unexported, and unexported fields cannot be set from packages other than the one they were defined in.
Only exported fields (those starting with an uppercase letter) can be stored in / retrieved from the datastore. You can use tags to tell the datastore what the property's name is in case it differs from the field's name. So you have to change your Items struct to this:
type Items struct {
    Code string    `datastore:"code"`
    Date time.Time `datastore:"date"`
    Name string    `datastore:"name"`
}

Extracting BIND parameters to build a JSON query

I have a file which was exported from BIND containing TSIG values for about 500 domain names. I need to repurpose the data into JSON for a REST API query. The BIND data is formatted like so:
// secondary-example.com.
key "2000000000000.key." {
    algorithm hmac-md5;
    secret "ahashedvalue=";
};
zone "secondary-example.com." {
    type slave;
    file "sec/secondary-example.com.";
    allow-transfer { 1.1.1.1;
        1.1.2.2;
    };
    also-notify { 1.1.1.1;
        2.2.2.2;
    };
    masters {
        1.2.3.4 key 2000000000000.key.;
    };
};
From this I need to extract the key, zone and secret. Here's an example API request.
{
  "properties": {
    "name": "secondary-example.com.",
    "accountName": "example",
    "type": "SECONDARY"
  },
  "secondaryCreateInfo": {
    "primaryNameServers": {
      "nameServerIpList": {
        "nameServerIp1": {
          "ip": "1.2.3.4",
          "tsigKey": "2000000000000.key.",
          "tsigKeyValue": "ahashedvalue="
        }
      }
    }
  }
}
I'm having difficulty crafting a regular expression appropriate for the scenario. I'm looking to construct the JSON in a Python script and send the request through Postman.
I spent a couple of days reading up on regex and figured out a solution. Each of those "zones" begins with a comment, e.g. "secondary-example.com.", and each set of BIND info was exactly 17 lines long. This solution is hacky and assumes the data is always correct, but it managed to work.
Separate the zones into chunks of text.
import re

zones = []
cur_zone = ''
f = open(bind_file).readlines()
for line in f:
    if line[0:2] == '//':
        zones.append(cur_zone)
        cur_zone = ''
    else:
        cur_zone = cur_zone + line
zones.append(cur_zone)  # Don't forget the final zone after the loop
zones.pop(0)  # Drop the first list item, it's empty
Iterate through those chunks and match the needed parameters.
for z in zones:
    z_lines = z.splitlines()
    # Regex patterns to match the required parameters
    key = re.findall(r'"(.*)"', z_lines[0])[0]
    secret = re.findall(r'"(.*)"', z_lines[2])[0]
    name = re.findall(r'"(.*)"', z_lines[5])[0]
    master = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', z_lines[15])[0]
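To finish the job, the extracted values can be dropped into a request body shaped like the example above; a minimal sketch to run inside the same loop (the accountName value is a placeholder taken from the sample request):

import json

payload = {
    "properties": {
        "name": name,
        "accountName": "example",  # placeholder account name
        "type": "SECONDARY"
    },
    "secondaryCreateInfo": {
        "primaryNameServers": {
            "nameServerIpList": {
                "nameServerIp1": {
                    "ip": master,
                    "tsigKey": key,
                    "tsigKeyValue": secret
                }
            }
        }
    }
}
print(json.dumps(payload, indent=2))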
