python: Lark-parser - python

I use the Lark library to parse boolean expressions like
(A=(value1) OR B>(value2)) AND C<=(value3)
and I also use it to parse keyless expressions like
(A OR B) AND C
the parser works correctly until I try to parse a keyless expression containing not only letters and numbers but other characters as well
valu* OR text
in this case the parser crashes with the exception that it does not recognise the character
Although if you use a key expression, it works correctly
A=(valu* OR text) AND B=(value2)
Please tell me where I have a bug in my code and how it can be fixed.
Code:
from lark import Lark, Tree, Token
rules = """
?start: expr
?expr: link_or
?link_or: (link_or "or"i)? link_and
?link_and: (link_and "and"i)? ( NAME | cond_eq | cond_gt | cond_ge | cond_lt | cond_le )
?cond_eq: KEY "=" const | "(" expr ")"
?cond_gt: KEY ">" const | "(" expr ")"
?cond_ge: KEY ">=" const | "(" expr ")"
?cond_lt: KEY "<" const | "(" expr ")"
?cond_le: KEY "<=" const | "(" expr ")"
KEY: NAME
?const: INT -> int
| string_raw -> string
?string_raw: /\((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*\)/
%import common.CNAME -> NAME
%import common.WS_INLINE
%import common.INT
%ignore WS_INLINE
"""
parser = Lark(rules)
for text in ("key1=(value1*) OR key2=(value2)", "key1 OR key2", "key1* OR key2"):
print("text:", text)
try:
tree = parser.parse(text)
print("parsed tree:", tree)
except BaseException as e:
print("Exception:", e)
print()
Output
example #1: "key1=(value1*) OR key2=(value2)": OK
text: key1=(value1*) OR key2=(value2)
parsed tree: Tree(Token('RULE', 'link_or'), [Tree(Token('RULE', 'cond_eq'), [Token('KEY', 'key1'), Tree('string', [Token('__ANON_2', '(value1*)')])]), Tree(Token('RULE', 'cond_eq'), [Token('KEY', 'key2'), Tree('string', [Token('__ANON_2', '(value2)')])])])
example #2: "key1 OR key2": OK
text: key1 OR key2
parsed tree: Tree(Token('RULE', 'link_or'), [Token('NAME', 'key1'), Token('NAME', 'key2')])
example #3: "key1* OR key2": FAILED
text: key1* OR key2
Exception: No terminal matches '*' in the current parser context, at line 1 col 5
key1* OR key2
^
Expected one of:
* __ANON_1
* OR
* LESSTHAN
* AND
* MORETHAN
* __ANON_0
* EQUAL
example #3 without try except block:
Traceback (most recent call last):
File "D:\codes\python\test.py", line 34, in <module>
tree = parser.parse(text)
File "C:\Program Files\Python\lib\site-packages\lark\lark.py", line 645, in parse
return self.parser.parse(text, start=start, on_error=on_error)
File "C:\Program Files\Python\lib\site-packages\lark\parser_frontends.py", line 96, in parse
return self.parser.parse(stream, chosen_start, **kw)
File "C:\Program Files\Python\lib\site-packages\lark\parsers\earley.py", line 266, in parse
to_scan = self._parse(lexer, columns, to_scan, start_symbol)
File "C:\Program Files\Python\lib\site-packages\lark\parsers\xearley.py", line 146, in _parse
to_scan = scan(i, to_scan)
File "C:\Program Files\Python\lib\site-packages\lark\parsers\xearley.py", line 119, in scan
raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan},
lark.exceptions.UnexpectedCharacters: No terminal matches '*' in the current parser context, at line 1 col 5
key1* OR key2
^
Expected one of:
* __ANON_1
* __ANON_0
* LESSTHAN
* MORETHAN
* AND
* OR
* EQUAL

Related

I am trying to extract sequences from a file, but getting following error

Code to extract sequences
from Bio import SeqIO
def get_cds_feature_with_qualifier_value(seq_record, name, value):
for feature in genome_record.features:
if feature.type == "CDS" and value in feature.qualifiers.get(name, []):
return feature
return None
genome_record = SeqIO.read("470.8208.gbk", "genbank")
db_xref = ['fig|470.8208.peg.2198', 'fig|470.8208.peg.2200', 'fig|470.8208.peg.2203', 'fig|470.8208.peg.2199', 'fig|470.8208.peg.2201', 'fig|470.8208.peg.2197', 'fig|470.8208.peg.2202', 'fig|470.8208.peg.2501', 'fig|470.8208.peg.2643', 'fig|470.8208.peg.2193', 'fig|470.8208.peg.2670', 'fig|470.8208.peg.2695', 'fig|470.8208.peg.2696', 'fig|470.8208.peg.2189', 'fig|470.8208.peg.2458', 'fig|470.8208.peg.2191', 'fig|470.8208.peg.2190', 'fig|470.8208.peg.2188', 'fig|470.8208.peg.2192', 'fig|470.8208.peg.2639', 'fig|470.8208.peg.3215', 'fig|470.8208.peg.2633', 'fig|470.8208.peg.2682', 'fig|470.8208.peg.3186', 'fig|470.8208.peg.2632', 'fig|470.8208.peg.2683', 'fig|470.8208.peg.3187', 'fig|470.8208.peg.2764', 'fig|470.8208.peg.2686', 'fig|470.8208.peg.2638', 'fig|470.8208.peg.2680', 'fig|470.8208.peg.2685', 'fig|470.8208.peg.2684', 'fig|470.8208.peg.2633', 'fig|470.8208.peg.2682', 'fig|470.8208.peg.3186', 'fig|470.8208.peg.2632', 'fig|470.8208.peg.2683', 'fig|470.8208.peg.3187', 'fig|470.8208.peg.2640', 'fig|470.8208.peg.3221', 'fig|470.8208.peg.3222', 'fig|470.8208.peg.3389', 'fig|470.8208.peg.2764', 'fig|470.8208.peg.2653', 'fig|470.8208.peg.3216', 'fig|470.8208.peg.3231', 'fig|470.8208.peg.2641', 'fig|470.8208.peg.2638', 'fig|470.8208.peg.2680', 'fig|470.8208.peg.2637', 'fig|470.8208.peg.2642', 'fig|470.8208.peg.2679', 'fig|470.8208.peg.3230', 'fig|470.8208.peg.2676', 'fig|470.8208.peg.2677', 'fig|470.8208.peg.1238', 'fig|470.8208.peg.2478', 'fig|470.8208.peg.2639', 'fig|470.8208.peg.854', 'fig|470.8208.peg.382', 'fig|470.8208.peg.383']
with open("nucleotides.fasta", "w") as nt_output, open("proteins.fasta", "w") as aa_output:
for xref in db_xref:
print ("Looking at " + xref)
cds_feature = get_cds_feature_with_qualifier_value (genome_record, "db_xref", xref)
gene_sequence = cds_feature.extract(genome_record.seq)
protein_sequence = gene_sequence.translate(table=11, cds=True)
# This is asking Python to halt if the translation does not match:
assert protein_sequence == cds_feature.qualifiers["translation"][0]
# Output FASTA records - note \n means insert a new line.
# This is a little lazy as it won't line wrap the sequence:
nt_output.write(">%s\n%s\n" % (xref, gene_sequence))
aa_output.write(">%s\n%s\n" % (xref, gene_sequence))
print("Done")
getting following error
/usr/local/lib/python3.7/dist-packages/Bio/GenBank/Scanner.py:1394: BiopythonParserWarning: Truncated LOCUS line found - is this correct?
:'LOCUS CP027704 3430798 bp DNA linear UNK \n'
BiopythonParserWarning,
Looking at fig|470.8208.peg.2198
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-32-323ff320990a> in <module>()
15 print ("Looking at " + xref)
16 cds_feature = get_cds_feature_with_qualifier_value (genome_record, "db_xref", xref)
---> 17 gene_sequence = cds_feature.extract(genome_record.seq)
18 protein_sequence = gene_sequence.translate(table=11, cds=True)
19
AttributeError: 'NoneType' object has no attribute 'extract'
You have a space between the get_cds_feature_with_qualifier_value call and its parameters (genome_record, "db_xref", xref), so the call is probably failing, leaving cds_feature as None.
Generally, you should provide reproducible examples so that someone else (who doesn't have the gbk file you're referencing) can still reproduce and troubleshoot your error.
Solved the problem.
'''
from Bio import SeqIO
def get_cds_feature_with_qualifier_value(seq_record, name, value):
for feature in genome_record.features:
if feature.type == "CDS" and value in feature.qualifiers.get(name, []):
return feature
return None
genome_record = SeqIO.read("470.8208.gbk", "genbank")
da_xref = ['fig|470.8208.peg.2198', 'fig|470.8208.peg.2200', 'fig|470.8208.peg.2203', 'fig|470.8208.peg.2199', 'fig|470.8208.peg.2201', 'fig|470.8208.peg.2197', 'fig|470.8208.peg.2202', 'fig|470.8208.peg.2501', 'fig|470.8208.peg.2643', 'fig|470.8208.peg.2193', 'fig|470.8208.peg.2670', 'fig|470.8208.peg.2695', 'fig|470.8208.peg.2696', 'fig|470.8208.peg.2189', 'fig|470.8208.peg.2458', 'fig|470.8208.peg.2191', 'fig|470.8208.peg.2190', 'fig|470.8208.peg.2188', 'fig|470.8208.peg.2192', 'fig|470.8208.peg.2639', 'fig|470.8208.peg.3215', 'fig|470.8208.peg.2633', 'fig|470.8208.peg.2682', 'fig|470.8208.peg.3186', 'fig|470.8208.peg.2632', 'fig|470.8208.peg.2683', 'fig|470.8208.peg.3187', 'fig|470.8208.peg.2764', 'fig|470.8208.peg.2686', 'fig|470.8208.peg.2638', 'fig|470.8208.peg.2680', 'fig|470.8208.peg.2685', 'fig|470.8208.peg.2684', 'fig|470.8208.peg.2633', 'fig|470.8208.peg.2682', 'fig|470.8208.peg.3186', 'fig|470.8208.peg.2632', 'fig|470.8208.peg.2683', 'fig|470.8208.peg.3187', 'fig|470.8208.peg.2640', 'fig|470.8208.peg.3221', 'fig|470.8208.peg.3222', 'fig|470.8208.peg.3389', 'fig|470.8208.peg.2764', 'fig|470.8208.peg.2653', 'fig|470.8208.peg.3216', 'fig|470.8208.peg.3231', 'fig|470.8208.peg.2641', 'fig|470.8208.peg.2638', 'fig|470.8208.peg.2680', 'fig|470.8208.peg.2637', 'fig|470.8208.peg.2642', 'fig|470.8208.peg.2679', 'fig|470.8208.peg.3230', 'fig|470.8208.peg.2676', 'fig|470.8208.peg.2677', 'fig|470.8208.peg.1238', 'fig|470.8208.peg.2478', 'fig|470.8208.peg.2639', 'fig|470.8208.peg.854', 'fig|470.8208.peg.382', 'fig|470.8208.peg.383']
db_xref=[]
for xref in da_xref:
db_xref.append('SEED:' + xref)
with open("nucleotides.fasta", "w") as nt_output, open("proteins.fasta", "w") as aa_output:
for xref in db_xref:
print ("Looking at", xref)
cds_feature = get_cds_feature_with_qualifier_value(genome_record, "db_xref", xref)
gene_sequence = cds_feature.extract(genome_record.seq)
protein_sequence = gene_sequence.translate(table=11, cds=True)
# This is asking Python to halt if the translation does not match:
assert protein_sequence == cds_feature.qualifiers["translation"][0]
# Output FASTA records - note \n means insert a new line.
# This is a little lazy as it won't line wrap the sequence:
nt_output.write(">%s\n%s\n" % (xref, gene_sequence))
aa_output.write(">%s\n%s\n" % (xref, gene_sequence))
print("Done")
'''

RecursionError: maximum recursion depth exceeded while using lark in python

I've written the decaf grammar specified in cs143 course.
Here is my code.
import sys
from lark import Lark, Transformer, v_args
decaf_grammar = r"""
start : PROGRAM
PROGRAM : DECL+
DECL : VARIABLEDECL | FUNCTIONDECL | CLASSDECL | INTERFACEDECL
VARIABLEDECL : VARIABLE ";"
VARIABLE : TYPE "ident"
TYPE : "int" | "double" | "bool" | "string" | "ident" | TYPE "[]"
FUNCTIONDECL : ( TYPE "ident" "(" FORMALS ")" STMTBLOCK ) | ( "void" "ident" "(" FORMALS ")" STMTBLOCK )
FORMALS : VARIABLE ("," VARIABLE)*
CLASSDECL : "class" "ident" ["extends" "ident"] ["implements" "ident" ("," "ident")*] "{" FIELD* "}"
FIELD : VARIABLEDECL | FUNCTIONDECL
INTERFACEDECL : "interface" "ident" "{" PROTOTYPE* "}"
PROTOTYPE : (TYPE "ident" "(" FORMALS ")" ";") | ("void" "ident" "(" FORMALS ")" ";")
STMTBLOCK : "{" VARIABLEDECL* STMT* "}"
STMT : ( EXPR? ";") | IFSTMT | WHILESTMT | FORSTMT | BREAKSTMT | RETURNSTMT | RETURNSTMT | PRINTSTMT | STMTBLOCK
IFSTMT : "if" "(" EXPR ")" STMT ["else" STMT]
WHILESTMT : "while" "(" EXPR ")" STMT
FORSTMT : "for" "(" EXPR? ";" EXPR ";" EXPR? ")" STMT
RETURNSTMT : "return" EXPR? ";"
BREAKSTMT : "break" ";"
PRINTSTMT : "print" "(" EXPR ("," EXPR)* ")" ";"
EXPR : (LVALUE "=" EXPR) | CONSTANT | LVALUE | "this" | CALL | "(" EXPR ")" | (EXPR "+" EXPR) | (EXPR "-" EXPR) | (EXPR "*" EXPR) | (EXPR "/" EXPR) | (EXPR "%" EXPR) | ("-" EXPR) | (EXPR "<" EXPR) | (EXPR "<=" EXPR) | (EXPR ">" EXPR) | (EXPR ">=" EXPR) | (EXPR "==" EXPR) | (EXPR "!=" EXPR) | (EXPR "&&" EXPR) | (EXPR "||" EXPR) | ("!" EXPR) | ("ReadInteger" "(" ")") | ("ReadLine" "(" ")") | ("new" "ident") | ("NewArray" "(" EXPR "," TYPE ")")
LVALUE : "ident" | (EXPR "." "ident") | (EXPR "[" EXPR "]")
CALL : ("ident" "(" ACTUALS ")") | (EXPR "." "ident" "(" ACTUALS ")")
ACTUALS : EXPR ("," EXPR)* | ""
CONSTANT : "intConstant" | "doubleConstant" | "boolConstant" | "stringConstant" | "null"
"""
class TreeToJson(Transformer):
#v_args(inline=True)
def string(self, s):
return s[1:-1].replace('\\"', '"')
json_parser = Lark(decaf_grammar, parser='lalr', lexer='standard', transformer=TreeToJson())
parse = json_parser.parse
def test():
test_json = '''
{
}
'''
j = parse(test_json)
print(j)
import json
assert j == json.loads(test_json)
if __name__ == '__main__':
test()
#with open(sys.argv[1]) as f:
#print(parse(f.read()))
It throws
RecursionError: maximum recursion depth exceeded.
I'm using lark for the first time
The problem you have is that you don't feel the difference between lark's rules and terminals. Terminals (they are only should be named in capitals) should match string, not structure of your grammar.
The main terminal's property you must support is that they, unlike rules, are not "recursive". Because of that lark struggle to build your grammar and goes to infinite recursion and stackoverflow.
try using sys.setrecursionlimit(xxxx) where xxxx is max recursion depth you want.
To know more visit docs.python.org/3 .

How to find and replace case sensitive whole words in python

Consider the below mcve:
import re
import textwrap
import traceback
import unittest
def replace_words(content, replacements):
rc = re.compile(r"[A-Za-z_]\w*")
def translate(match):
word = match.group(0)
return replacements.get(word, word)
return rc.sub(translate, content, re.IGNORECASE | re.MULTILINE)
class class_name(unittest.TestCase):
def setUp(self):
self.replacements = [
{
'PLUS': '"+"',
'DASH': '"-"',
'BANG': '"!"',
'TILDE': '"~"',
'STAR': '"*"',
'SLASH': '"/"',
'PERCENT': '"%"',
'LEFT_PAREN': '"("',
'RIGHT_PAREN': '")"'
}, {
"IF": "fi",
"FOO": "oof",
"BAR": "rab",
"OP_FOO": "oof_op"
}
]
self.texts = [
textwrap.dedent("""\
variable_identifier :
IDENTIFIER
primary_expression :
foo1
foo2
foo3
LEFT_PAREN expression RIGHT_PAREN
unary_operator :
PLUS
DASH
BANG
TILDE
multiplicative_expression :
unary_expression
multiplicative_expression STAR unary_expression
multiplicative_expression SLASH unary_expression
multiplicative_expression PERCENT unary_expression\
"""),
textwrap.dedent("""\
IF identifier IDENTIFIER FOO BAR BARycentric
OP_FOO
""")
]
self.expected_results = [
textwrap.dedent("""\
variable_identifier :
IDENTIFIER
primary_expression :
foo1
foo2
foo3
"(" expression ")"
unary_operator :
"+"
"-"
"!"
"~"
multiplicative_expression :
unary_expression
multiplicative_expression "*" unary_expression
multiplicative_expression "/" unary_expression
multiplicative_expression "%" unary_expression\
"""),
textwrap.dedent("""\
fi identifier IDENTIFIER oof rab BARycentric
oof_op
""")
]
def _tester(self, f):
replacements = self.replacements
expected_results = self.expected_results
texts = self.texts
self.assertEqual(f(texts[0], replacements[0]), expected_results[0])
self.assertEqual(f(texts[1], replacements[1]), expected_results[1])
def test_replace_words(self):
self._tester(replace_words)
if __name__ == "__main__":
unittest.main()
replace_words function is attempting to search and replace case sensitive whole words in a given text using a dictionary of replacements above code but it will fail in the line self.assertEqual(f(texts[0], replacements[0]), expected_results[0]) though and I don't know why.
So the question would be, how do you find and replace case sensitive whole words using a replacements dictionary in python?
You can use re.sub and re.findall:
import re
def regex_string(d, to_lower = False):
if not to_lower:
return '|'.join(r'\b{}\b'.format(i) for i in d.keys())
return '|'.join([c for b in [[r'\b{}\b'.format(i.lower()), r'\b{}\b'.format(i)] for i in d.keys()] for c in b])
replacements = {
'PLUS': '"+"',
'DASH': '"-"',
'BANG': '"!"',
'TILDE': '"~"',
'STAR': '"*"',
'SLASH': '"/"',
'PERCENT': '"%"',
'LEFT_PAREN': '"("',
'RIGHT_PAREN': '")"'
}
replaced = re.sub(regex_string(replacements, True), '{}', content)
final_result = replaced.format(*[replacements.get(i, i) for i in re.findall(regex_string(replacements, True), content)])
Output (case 1):
variable_identifier :
IDENTIFIER
primary_expression :
foo1
foo2
foo3
"(" expression ")"
unary_operator :
"+"
"-"
"!"
"~"
multiplicative_expression :
unary_expression
multiplicative_expression "*" unary_expression
multiplicative_expression "/" unary_expression
multiplicative_expression "%" unary_expression
Output (case 2):
fi identifier IDENTIFIER oof rab BARycentric
oof_op
Or, even shorter:
replaced = re.sub(regex_string(replacements, True), lambda x:replacements.get(x.group(), x.group()), content)

PyParsing: parseaction called multiple

I am a beginner with pyparsing but have experience with other parsing environments.
On my first small demo project I encountered a strange behavior of parsing actions: Parse action of base token (ident_simple) is called twice for each token of ident_simple.
import io, sys
from pyparsing import *
def pa_ident_simple(s, l, t):
print('ident_simple: ' + str(t))
def pa_ident_combined(s, l, t):
print('ident_combined: ' + str(t))
def make_grammar():
number = Word(nums)
ident_simple = Word( alphas, alphanums + "_" )
ident_simple.setParseAction(pa_ident_simple)
ident_combined = Combine(ident_simple + Literal('.') + ident_simple)
ident_combined.setParseAction(pa_ident_combined)
integer = number
elems = ( ident_combined | ident_simple | integer)
grammar = OneOrMore(elems) + StringEnd()
return grammar
if __name__ == "__main__":
inp_str = "UUU FFF.XXX GGG"
grammar = make_grammar()
print (inp_str, "--->", grammar.parseString( inp_str ))
For 'ident_combined' token it looks good: Parseaction is called once for each sub token 'ident_simple' and once for combined token.
I believe that the combined token is the problem: Parseaction of 'ident_simple' is called only once if 'ident_combined' is removed.
Can anybody give me a hint how to combine tokens correctly?
Thanks for any help
Update: When playing around I took the class "Or" instead of "MatchFirst".
elems = ( ident_combined ^ ident_simple ^ integer)
This showed a better behavior (in my opinion).
Output of original grammar (using "MatchFirst"):
ident_simple: ['UUU']
ident_simple: ['UUU']
ident_simple: ['FFF']
ident_simple: ['XXX']
ident_combined: ['FFF.XXX']
ident_simple: ['GGG']
ident_simple: ['GGG']
UUU FFF.XXX GGG ---> ['UUU', 'FFF.XXX', 'GGG']
Output of modified grammar (using "Or"):
ident_simple: ['UUU']
ident_simple: ['FFF']
ident_simple: ['XXX']
ident_combined: ['FFF.XXX']
ident_simple: ['GGG']
UUU FFF.XXX GGG ---> ['UUU', 'FFF.XXX', 'GGG']

Pyparsing error when evaluating WFF logic expressions?

I'm new to Python and pyparsing, and I'm making a logic expression evaluator.
The formula must be a WFF. The BNF of WFF is:
<alpha set> ::= p | q | r | s | t | u | ...
(the arbitrary finite set of propositional variables)
<form> ::= <alpha set> | ¬<form> | (<form>V<form>) | (<form>^<form>)
| (<form> -> <form>) | (<form> <-> <form>)
My code is:
'''
Created on 17/02/2012
#author: Juanjo
'''
from pyparsing import *
from string import lowercase
def fbf():
atom = Word(lowercase, max=1) #aphabet
op = oneOf('^ V => <=>') #Operators
identOp = oneOf('( [ {')
identCl = oneOf(') ] }')
form = Forward() #Iniciar de manera recursiva
#Grammar:
form << ( (Group(Literal('~') + form)) | ( Group(identOp + form + op + form + identCl) ) | ( Group(identOp + form + identCl) ) | (atom) )
return form
entrada = raw_input("Input please: ") #userinput
print fbf().parseString(entrada)
The problem is when I use these expressions: a^b and aVb.
The parser should return an error, but there's no error; instead it returns a. Actually, any symbol after a will be ignored.
The WFF version of those forms are: (a^b) and (aVb)
Both work correctly. I think the problem is in the atom definition.
What am I doing wrong?
By default parseString will just parse the beginning of the string.
You can force it to parse the entire string by changing the code to:
print fbf().parseString(entrada, parseAll=True)
Alternatively, you can end the grammar with the StringEnd() token - see the documentation under parseString in http://packages.python.org/pyparsing/ for more details.

Categories