Read patterns Simple DSL using Python SLY - python

I try to create simple DSL with Python SLY . But, I can't get the result as I expected because the parser can't read it properly. So here the code :
Lexer
from sly import Lexer
class ConfigLexer(Lexer):
tokens = { ANIMALS, BLOOD, SKIN, BREATHE, ANIMAL_NAME, VALUE, ASSIGN }
ignore = " \t\r"
ignore_newline = r'\n+'
ANIMALS = "ANIMALS"
BLOOD = "BLOOD"
SKIN = "SKIN"
BREATHE = "BREATHE"
ANIMAL_NAME = r'\{[a-zA-Z_][a-zA-Z0-9_]*\}'
VALUE = r'[a-zA-Z_][a-zA-Z0-9_,.: ]*'
ASSIGN = r'\='
Parser
from sly import Parser
class ConfigParser(Parser):
tokens = ConfigLexer.tokens
def __init__(self):
self.config = dict()
self.dict_attribute = dict()
self.animal_name = ""
#_("ANIMALS animaldetails")
def animals(self, p):
pass
#_("ANIMAL_NAME animalnamedetails")
def animaldetails(self, p):
self.animal_name = p.ANIMAL_NAME.replace("{", "").replace("}","")
if self.animal_name not in self.config:
self.config[self.animal_name] = self.dict_attribute
#_("BLOOD ASSIGN VALUE")
def animalnamedetails(self, p):
if p.BLOOD not in self.dict_attribute:
self.dict_attribute[p.BLOOD] = p.VALUE
#_("SKIN ASSIGN VALUE")
def animalnamedetails(self, p):
if p.SKIN not in self.dict_attribute:
self.dict_attribute[p.SKIN] = p.VALUE
#_("BREATHE ASSIGN VALUE")
def animalnamedetails(self, p):
if p.BREATHE not in self.dict_attribute:
self.dict_attribute[p.BREATHE] = p.VALUE
def get_config(self):
return self.config
but when I run it.
import json
import ConfigLexer
import ConfigParser
if __name__ == '__main__':
lexer = ConfigLexer()
parser = ConfigParser()
long_string = """ANIMALS
{MAMMALS}
BLOOD = WARM
SKIN = FUR
BREATHE = LUNGS
{FISH}
BLOOD = COLD
SKIN = SCALY
BREATHE = GILLS"""
result = parser.parse(lexer.tokenize(long_string))
cfg = parser.get_config()
data_json = json.dumps(cfg, indent=3)
print(data_json)
as I expected, the result would be like this.
data_json = {
'MAMMALS': {'BLOOD': 'WAMR': 'SKIN': 'FUR OR HAIR', 'BREATHE': 'LUNGS'},
'FISH': {'BLOOD': 'COLD', 'SKIN': 'SCALY', 'BREATHE': 'GILLS'}
}
but I only get something like this.
data_json = {
'MAMMALS': {
'BLOOD': 'WARM'
}
}
result of executing :
sly: Syntax error at line 1, token=SKIN
{
"MAMMALS": {
"BLOOD": "WARM"
}
}
I guess I have to edit the Parser, but I can't think how, and would appreciate any pointers you can give me.

You have non-terminals named animals, animaldetails, and animalnameddetails, in plural, which would normally lead one to expect that the grammar for each of them would allow a sequence of things. But they don't. Each of these categories parses a single thing. You've implemented the singular, and although it's named in plural, there's no repetition.
That this was not your intent is evident from your example, which does have multiple sections and multiple attributes in each section. But since the grammar only describes one attribute and value, the second one is a syntax error.
Traditionally, grammars will implement sequences with pairs of non-terminals; a singular non-terminal which describes a single thing, and a plural non-terminal which describes how lists are formed (simple concatenation, or separated by punctuation). So you might have:
file: sections
sections: empty
| sections section
section: category attributes
settings: empty
| settings setting
setting: attribute '=' value
You probably should also look fora description of how to manage semantic values. Storing intermediate results in class members, as you do, works only when the grammar doesn't allow nesting, which is relatively unusual. It's a technique which will almost always get you into trouble. The semantic actions of each production should manage these values:
A singular object syntax should create and return a representation of the object.
A plural→empty production should create and return a representation of an empty collection.
Similarly, a production of the form things→ things thing should append the new thing to the aggregate of things, and then return the augmented aggregate.

Cheers...
from json import dumps
from sly import Lexer, Parser
class MyLexer(Lexer):
tokens = {ANIMALS, ANIMAL_NAME, BLOOD, SKIN, BREATHE, ASSIGN, ASSIGN_VALUE}
ignore = ' \t'
ANIMALS = r'ANIMALS'
BLOOD = r'BLOOD'
SKIN = r'SKIN'
BREATHE = r'BREATHE'
ASSIGN = r'='
ASSIGN_VALUE = r'[a-zA-Z_][a-zA-Z0-9_]*'
#_(r'\{[a-zA-Z_][a-zA-Z0-9_]*\}')
def ANIMAL_NAME(self, t):
t.value = str(t.value).lstrip('{').rstrip('}')
return t
#_(r'\n+')
def NEWLINE(self, t):
self.lineno += t.value.count('\n')
class MyParser(Parser):
tokens = MyLexer.tokens
def __init__(self):
self.__config = {}
def __del__(self):
print(dumps(self.__config, indent=4))
#_('ANIMALS animal animal')
def animals(self, p):
pass
#_('ANIMAL_NAME assignment assignment assignment')
def animal(self, p):
if p.ANIMAL_NAME not in self.__config:
self.__config[p.ANIMAL_NAME] = {}
animal_name, *assignments = p._slice
for assignment in assignments:
assignment_key, assignment_value = assignment.value
self.__config[p.ANIMAL_NAME][assignment_key] = assignment_value
#_('key ASSIGN ASSIGN_VALUE')
def assignment(self, p):
return p.key, p.ASSIGN_VALUE
#_('BLOOD', 'SKIN', 'BREATHE')
def key(self, p):
return p[0]
if __name__ == '__main__':
lexer = MyLexer()
parser = MyParser()
text = '''ANIMALS
{MAMMALS}
BLOOD = WARM
SKIN = FUR
BREATHE = LUNGS
{FISH}
BLOOD = COLD
SKIN = SCALY
BREATHE = GILLS
'''
parser.parse(lexer.tokenize(text))
Output:
{
"MAMMALS": {
"BLOOD": "WARM",
"SKIN": "FUR",
"BREATHE": "LUNGS"
},
"FISH": {
"BLOOD": "COLD",
"SKIN": "SCALY",
"BREATHE": "GILLS"
}
}

Related

Object oriented programming with abstract class

I want to achieve the below:
def do_something(request):
company_name = request.get("company_name", DEFAULT_COMPANY)
data = request.get("data")
response = transform_data_according_to(data, company_name)
return response
I did the following for it:
class Transform(ABC):
def __init__(self, data):
self.data = data
#abstractmethod
def transform(self):
pass
class CompanyA(Transform):
def transform(self):
# do_transformation
return transformed_data
def do_something(request):
company_name = request.get("company_name", DEFAULT_COMPANY)
data = request.get("data")
if company_name == CompanyA:
response = CompanyA.transform(data)
return response
Instead i would like to do something like this using correct object oriented principles:
def do_something(request):
company_name = request.get("company_name", DEFAULT_COMPANY)
data = request.get("data")
response = Transform(data, company_name)
return response
I want to know where I might be thinking wrong in terms of the desired approach versus the implemented approach. Is the implemented approach correct, the if else checks can grow quite big in that case.
Thanks to teraflop
The simple, idiomatic way to do this in Python would be to look up the Transform subclass in a dictionary:
transform_classes = {
"CompanyA": CompanyA,
# ...
}
def do_something(request):
company_name = request.get("company_name", DEFAULT_COMPANY)
data = request.get("data")
transformer = transform_classes[company_name](data)
return transformer.transform()
If you prefer to be more rigorously object-oriented, you could wrap the dictionary in an object (e.g. TransformLookupByName) instead of accessing it directly.
There are also various kinds of metaprogramming magic you can use to build the dictionary automatically without having to name each subclass explicitly. For example, this will collect all of the Transform subclasses in the current source file:
transform_classes = {
k:v for k,v in globals().items()
if isinstance(v, type) and issubclass(v, Transform) and v != Transform
}

Most efficient way to handle big number of constants

I am writing a program that, depending on a certain values from an Excel table, makes an API call. There are 2 conditions from the table that will be checked:
Language
Provider
Depending on those two values a different set of constants is needed for the API call:
def run_workflow(provider, language, workflow):
if provider == 'xxxx' and language == 0:
wf_ready = provider_ready
wf_unverified = provider_unverified
wf_active = provider_active
wf_another = provider_another
wf_closed = provider_closed
wf_wrongid = provider_wrongid
elif provider == 'yyyy' and language == 0:
wf_ready = provider_ready
wf_unverified = provider_unverified
wf_active = provider_active
wf_another = provider_another
wf_closed = provider_closed
wf_wrongid = provider_wrongid
elif ...
if workflow == 'ready':
response = requests.post(API + wf_ready),headers=header, data=json.dumps(conversation))
elif workflow == 'unverified':
response = requests.post(API + wf_unverified),headers=header, data=json.dumps(conversation))
elif ...
There are 2 provider and 7 different languages and I am trying to figure out the most efficient (and Pythonic way) to handle this scenario and came up with creating a class for each language:
class Workflow_Language():
def english(self):
self.provider_unverified = 1112
self.provider_ready = 1113
self.provider_active = 1114
self.provider_vip = 1115
def russian(self):
self.provider_unverified = 1116
self.provider_ready = 1117
self.provider_active = 1118
self.provider_vip = 1119
def ...
...
Is there maybe a better way to handle this?
One way is to map constants to appropriate handlers:
class LanguageData:
def __init__(self, unverified, ready, active, vip):
self.unverified = unverified
self.ready = ready
self.active = active
self.vip = vip
def english():
return LanguageData(1,2,3,4)
def russian():
return LanguageData(5,6,7,8)
LANGUAGE_MAP = {'en': english, 'ru': russian}
I've made up 'en', 'ru' values for clarity. It seems that 0 is in your case? Also note that english and russian are standalone functions. Finally the LanguageData class is not mandatory, you can simply return a dictionary from those functions. But workin with attributes instead of string keys seems easier to maintain.
And then in the code:
def run_workflow(provider, language, workflow):
lang_data = LANGUAGE_MAP[language]()
if workflow == 'ready':
url = API + data.ready
elif workflow == 'unverified':
url = API + data.unverified
response = requests.post(url, headers=header, data=json.dumps(conversation))
Of course workflow can be wrapped in a similar way if there are more than 2 possible values.
Analogously for provider. Unless the action depends on both provider and language at the same time in which case you need a double map:
LANG_PROV_MAP = {
('en', 'xxxx'): first,
('ru', 'yyyy'): second,
}
def run_workflow(provider, language, workflow):
data = LANG_PROV_MAP[(provider, language)]()
...
The original code can be simplified with a tricky decorator:
LANGUAGE_MAP = {}
def language_handler(lang):
def wrapper(fn):
LANGUAGE_MAP[lang] = fn
return fn
return wrapper
#language_handler('en')
def handler():
return LanguageData(1,2,3,4)
#language_handler('ru')
def handler():
return LanguageData(5,6,7,8)
Also note that if the data is "constant" (i.e. doesn't depend on the context) then you can completely omit callables to make everything even simplier:
LANGUAGE_MAP = {
'en': LanguageData(1,2,3,4),
'ru': LanguageData(5,6,7,8),
}
def run_workflow(provider, language, workflow):
data = LANGUAGE_MAP[language]
...
The combination of the language and provider can compose the method name and the call will be invoked dynamically.
Example:
import sys
def provider1_lang2():
pass
def provider2_lang4():
pass
# get the provider / lang and call the method dynamically
provider = 'provider2'
lang = 'lang4'
method_name = '{}_{}'.format(provider,lang)
method = getattr(sys.modules[__name__], method_name)
method()

Python PLY Yacc "syntax error"

Okay, so I'm trying to build a parser of my mini-language (obviously), and setting variables seems to be properly working. But as soon as Yacc comes across a function definition, it just gives me a syntax error, and a couple of EOF errors (which I know are from when Yacc has no remaining rules to set) and nothing else happens... Where did I go wrong?
Here's an example of the syntax I'm parsing:
$name = "John Doe"
$age = 72
$waterInOceans = 95.4
!testFunction {
}
Where the !testFunction { } section is defining a function (based off of the exclamation point). I don't know if that's going to be useful in debugging.
# The Lexer
import ply.lex as lex
tokens = ["MINUS", "SEPARATOR", "MODIFIER", "FUNCTION_NAME", "UNDEF_BLOCK", "VARIABLE_NAME", "EQUALS", "STRING", "FLOAT", "INNER_CONTENT", "ARGUMENTS", "INTEGER", "PLUS"]
def t_ARGUMENTS(t): # Finds arguments in calls and function definitions
r'\(.*\)'
t.value = t.value[1:-1] # strip parenthesis
t.value = t.value.split(" && ")
return t
def t_STRING(t): # finds strings
r'"\w.+"'
t.value = t.value[1:-1] # strips the quotation marks of the string
return t
def t_FLOAT(t): # finds floats
r'\d+.\d+'
t.value = float(t.value)
return t
def t_INTEGER(t):
r'\d+'
t.value = int(t.value)
return t
def t_VARIABLE_NAME(t):
r'\$\w*\b'
t.value = t.value[1:]
return t
def t_INNER_CONTENT(t):
r'\{\n.*\n\}|\{.*\}'
t.value = t.value[1:-1]
return t
def t_FUNCTION_NAME(t):
r'!\w+'
t.value = t.value[1:]
return t
t_ignore = r"\n|\t|\r"
t_EQUALS = r"\="
t_PLUS = r"\+"
t_MINUS = r"-"
t_MODIFIER = r"\."
t_SEPARATOR = r"\,"
t_UNDEF_BLOCK = r"\w+" # Any block of text that is left over and isn't assigned by the end (used by functions)
def t_error(t):
t.lexer.skip(1)
lex.lex()
#opened = open("example.zeq", "r")
#content = opened.read()
#opened.close()
#lex.input(content)
And then the Yacc half:
# The Yacc parser
import ply.yacc as yacc
import compiler # Get the compiler (tokenizer; compiler.py) which generates tokens
import sys
from os import system
##############
### IGNORE ###
tokens = compiler.tokens
#system("clear")
print("Executing "+sys.argv[1]+" |\n"+("-"*(len(sys.argv[1])+12)))
### IGNORE ###
##############
VARIABLES = {}
FUNCTIONS = {}
def p_assign(p): # Set new variable
'''assignment : VARIABLE_NAME EQUALS compound
| VARIABLE_NAME EQUALS STRING
| VARIABLE_NAME EQUALS INTEGER
| VARIABLE_NAME EQUALS FLOAT'''
#print("Setting '{}' to '{}'...".format(str(p[1]), str(p[3])))
VARIABLES[p[1]] = p[3]
def p_number(p): # Combines floats and integers into a blanket non-terminal for simplicity sakes
'''number : FLOAT
| INTEGER'''
p[0] = p[1]
def p_compound(p): # Complete the value *before* the variable is assigned!
'''compound : number PLUS number
| number MINUS number'''
type1 = type(p[1])
type2 = type(p[3])
operator = p[2]
if operator == "+":
p[0] = p[1] + p[3]
elif operator == "-":
p[0] = p[1] - p[3]
def p_undefined(p):
'''undefined : UNDEF_BLOCK'''
print("Undefined block")
def p_function(p):
'''function : FUNCTION_NAME INNER_CONTENT'''
print("Creating a function")
name = p[1]
content = p[2]
FUNCTIONS[name] = content
def p_empty(p):
'''empty : '''
#~ def p_error(p):
#~ if p:
#~ print("Syntax error: "+p.type)
#~ else:
#~ pass
parser = yacc.yacc()
opened = open(sys.argv[1], "r")
content = opened.read()
opened.close()
for line in content.splitlines():
parser.parse(line)
print(VARIABLES)
print(FUNCTIONS)
I'm waiting for it to be a simple overlooked detail...
When you ask Ply (or yacc, for that matter) to parse an input, it attempts to recognize a single instance of the top-level non-terminal (or "starting symbol"). This will usually a grammatical description of the entire input, so it will often have a name like program, although there are use cases in which it is useful to parse just a part of the input.
Ply (and yacc) assume that the first grammar production is for the starting symbol. In your case, the first production is assignment, and so that is what it will try to parse (and nothing else). assignment cannot derive a function definition or any other statement type, so those cause syntax errors.
If you want to explicitly tell Ply what the top-level symbol is, you can do so. See the manual section on starting symbols.

How to parse optional and named arguments into list and dict?

I would like a compact way to parse one-line strings that start with mandatory list-elements (unspecified number) and ends with dictionary-like definitions using =.
The element-separator should be , and spaces should become part of the element -- which rules out shlex, I think.
Spaces should/may be stripped at the start and end (quotes, too)
If an element would contain a , the user is required to quote with "
either "key=value,with,comma"
or key="value,with,comma" -- whatever is easier to implement
It's ok to have undefined behavior with wrong quoting or with elements containing a quote-char.
Behaviour with double keys is also undefined.
Slight variations of this are ok if it simplifies the implementation a lot.
Lets call the function opts and have it return a list and a dict,
Here are some input examples and desired results:
opts('dog,cat') # -> ["dog", "cat"], {}
opts('big fish,cat') # -> ["big fish", "cat"], {}
opts('"a dog, a cat",a fish') # -> ["a dog, a cat", "a fish"], {}
opts('key=value') # -> [] {'key':'value'}
opts('key=the value,x=y') # -> [] {'key':'the value', 'x':'y'}
opts('dog, big fish, eats="any, but peas", flies = no! '
# -> ['dog','big fish'], {'eats':'any, but peas', 'flies':'no!' }
I disregarded shlex, argparse, optparse and configparser, I can't see how I should do it with those. I am not sure if Regular Expressions crack this nut, though. json is a bit too strict with the syntax, I think. As is eval, if a bit more to my liking (because it parses python ;-))
My manual solution in macro is not very flexible and I would like to have its parameter handling be replaced by the more general opts(s) function described above:
def macro(s):
kw = { 'see':u"\\see", 'type':u"Chapter", 'title': u'??' }
params = s.split(",")
kw['label'] = params[0]
if len(params) > 1: # very inflexible
kw['title'] = params[1]
for param in params[2:]: # wrong if p[1] is already key=value
key, value = param.split("=",1) # doesn't handle anything, too simple
kw[key] = value
# ...rest of code...
The goal is to have the reusable function opts to be used here:
def macro_see(s):
ls, kw = opts(s)
# ...rest of code...
In this solution, opts is essentially the same as yuvi's (with the added strip). The splitter is a customization of shlex, using posix mode to handle quotes.
def mylex(x):
lex = shlex.shlex(x, posix=True)
lex.whitespace = ','
lex.whitespace_split = True
return list(lex)
def opts(x):
ll = []
dd = {}
items = mylex(x)
for item in items:
if '=' in item:
k, v = item.split('=',1)
dd[k.strip(' "')] = v.strip(' "')
else:
ll.append(item.strip(' "'))
return (ll,dd)
It passes:
trials = [
['dog,cat',(["dog", "cat"], {})],
['big fish,cat',(["big fish", "cat"], {})],
['"a dog, a cat",a fish',(["a dog, a cat", "a fish"], {})],
['key=value',([], {'key':'value'})],
['key=the value,x=y',([], {'key':'the value', 'x':'y'})],
['dog, big fish, eats="any, but peas", flies = no!',(['dog','big fish'], {'eats':'any, but peas', 'flies':'no!' })],
]
for (x,y) in trials:
print('%r'%x)
args = opts(x)
print(args)
if args != y:
print('error, %r'%y)
print('')
What you probably want is to create your own split function, with a flag that toggles when " are introduced. Something like this:
def my_split(string, deli):
res = []
flag = True
start = 0
for i, c in enumerate(string):
if c == '"':
if flag:
flag = False
else:
flag = True
if c == deli and flag:
res.append(string[start:i])
start = i+1
res.append(string[start:])
return res
From there, it's really easy to proceed:
def opts(s):
items = map(lambda x: x.strip(), my_split(s, ','))
# collect
ls = []
kw = {}
for item in items:
if '=' in item:
k, v = item.split('=', 1)
kw[k.strip()] = v.strip()
else:
ls.append(item)
return ls, kw
It's not perfect, there are still a few thing you might need to work on, but that's definetly a start.
Here's an approach where I massage the input so it matches the syntax requirements for python function arguments, then harness the python interpreter via eval to parse them.
import re
s = 'hog, "cog" , dog, bog, "big fish", eats="any, but peas", flies = "no!" '
# I think this will add quotes around any unquoted positional arguments
s = re.sub('(^|,)\ *([^\"\',\ ]+)\ *(?=,|$)', r'\1"\2"', s)
def f(*args, **kwargs):
return (args, kwargs)
print eval("f("+s+")", {'f':f})
output:
(('hog', 'cog', 'dog', 'bog', 'big fish'), {'flies': 'no!', 'eats': 'any, but peas'})

Recursive parsing for lisp like syntax

im trying to parse lines in the form:
(OP something something (OP something something ) ) ( OP something something )
Where OP is a symbol for a logical gate (AND, OR, NOT) and something is the thing i want to evaluate.
The output im looking for is something like:
{ 'OPERATOR': [condition1, condition2, .. , conditionN] }
Where a condition itself can be a dict/list pair itself (nested conditions). So far i tried something like:
tree = dict()
cond = list()
tree[OP] = cond
for string in conditions:
self.counter += 1
if string.startswith('('):
try:
OP = string[1]
except IndexError:
OP = 'AND'
finally:
if OP == '?':
OP = 'OR'
elif OP == '!':
OP = 'N'
# Recurse
cond.append(self.parse_conditions(conditions[self.counter:], OP))
break
elif not string.endswith(")"):
cond.append(string)
else:
return tree
return tree
I tried other ways aswell but i just can't wrap my head around this whole recursion thing so im wondering if i could get some pointers here, i looked around the web and i found some stuff about recursive descent parsing but the tutorials were all trying to do something more complicated than i needed.
PS: i realize i could do this with existing python libraries but what would i learn by doing that eh?
I'm posting this without further comments, for learning purposes (in the real life please do use a library). Note that there's no error checking (a homework for you!)
Feel free to ask if there's something you don't understand.
# PART 1. The Lexer
symbols = None
def read(input):
global symbols
import re
symbols = re.findall(r'\w+|[()]', input)
def getsym():
global symbols
return symbols[0] if symbols else None
def popsym():
global symbols
return symbols.pop(0)
# PART 2. The Parser
# Built upon the following grammar:
#
# program = expr*
# expr = '(' func args ')'
# func = AND|OR|NOT
# args = arg*
# arg = string|expr
# string = [a..z]
def program():
r = []
while getsym():
r.append(expr())
return r
def expr():
popsym() # (
f = func()
a = args()
popsym() # )
return {f: a}
def func():
return popsym()
def args():
r = []
while getsym() != ')':
r.append(arg())
return r
def arg():
if getsym() == '(':
return expr()
return string()
def string():
return popsym()
# TEST = Lexer + Parser
def parse(input):
read(input)
return program()
print parse('(AND a b (OR c d)) (NOT foo) (AND (OR x y))')
# [{'AND': ['a', 'b', {'OR': ['c', 'd']}]}, {'NOT': ['foo']}, {'AND': [{'OR': ['x', 'y']}]}]

Categories