Consider the MCVE below:
import re
import textwrap
import traceback
import unittest
def replace_words(content, replacements):
    rc = re.compile(r"[A-Za-z_]\w*")
    def translate(match):
        word = match.group(0)
        return replacements.get(word, word)
    return rc.sub(translate, content, re.IGNORECASE | re.MULTILINE)
class class_name(unittest.TestCase):
    def setUp(self):
        self.replacements = [
            {
                'PLUS': '"+"',
                'DASH': '"-"',
                'BANG': '"!"',
                'TILDE': '"~"',
                'STAR': '"*"',
                'SLASH': '"/"',
                'PERCENT': '"%"',
                'LEFT_PAREN': '"("',
                'RIGHT_PAREN': '")"'
            }, {
                "IF": "fi",
                "FOO": "oof",
                "BAR": "rab",
                "OP_FOO": "oof_op"
            }
        ]
        self.texts = [
            textwrap.dedent("""\
variable_identifier :
IDENTIFIER
primary_expression :
foo1
foo2
foo3
LEFT_PAREN expression RIGHT_PAREN
unary_operator :
PLUS
DASH
BANG
TILDE
multiplicative_expression :
unary_expression
multiplicative_expression STAR unary_expression
multiplicative_expression SLASH unary_expression
multiplicative_expression PERCENT unary_expression\
"""),
            textwrap.dedent("""\
IF identifier IDENTIFIER FOO BAR BARycentric
OP_FOO
""")
        ]
        self.expected_results = [
            textwrap.dedent("""\
variable_identifier :
IDENTIFIER
primary_expression :
foo1
foo2
foo3
"(" expression ")"
unary_operator :
"+"
"-"
"!"
"~"
multiplicative_expression :
unary_expression
multiplicative_expression "*" unary_expression
multiplicative_expression "/" unary_expression
multiplicative_expression "%" unary_expression\
"""),
            textwrap.dedent("""\
fi identifier IDENTIFIER oof rab BARycentric
oof_op
""")
        ]
    def _tester(self, f):
        replacements = self.replacements
        expected_results = self.expected_results
        texts = self.texts
        self.assertEqual(f(texts[0], replacements[0]), expected_results[0])
        self.assertEqual(f(texts[1], replacements[1]), expected_results[1])

    def test_replace_words(self):
        self._tester(replace_words)

if __name__ == "__main__":
    unittest.main()
The replace_words function in the code above attempts to search and replace case-sensitive whole words in a given text using a dictionary of replacements, but it fails at the line self.assertEqual(f(texts[0], replacements[0]), expected_results[0]) and I don't know why.
So the question is: how do you find and replace case-sensitive whole words using a replacements dictionary in Python?
You can use re.sub and re.findall:
import re
def regex_string(d, to_lower=False):
    # Build an alternation of whole-word patterns from the dict keys.
    if not to_lower:
        return '|'.join(r'\b{}\b'.format(i) for i in d.keys())
    # Interleave a lowercased variant of each key with the original.
    return '|'.join([c for b in [[r'\b{}\b'.format(i.lower()), r'\b{}\b'.format(i)] for i in d.keys()] for c in b])
replacements = {
    'PLUS': '"+"',
    'DASH': '"-"',
    'BANG': '"!"',
    'TILDE': '"~"',
    'STAR': '"*"',
    'SLASH': '"/"',
    'PERCENT': '"%"',
    'LEFT_PAREN': '"("',
    'RIGHT_PAREN': '")"'
}
# First pass: stub every whole-word match out with a '{}' placeholder, then
# fill the placeholders with the mapped values in match order (this assumes
# the input itself contains no literal '{' or '}').
replaced = re.sub(regex_string(replacements, True), '{}', content)
final_result = replaced.format(*[replacements.get(i, i) for i in re.findall(regex_string(replacements, True), content)])
Output (case 1):
variable_identifier :
IDENTIFIER
primary_expression :
foo1
foo2
foo3
"(" expression ")"
unary_operator :
"+"
"-"
"!"
"~"
multiplicative_expression :
unary_expression
multiplicative_expression "*" unary_expression
multiplicative_expression "/" unary_expression
multiplicative_expression "%" unary_expression
Output (case 2):
fi identifier IDENTIFIER oof rab BARycentric
oof_op
Or, even shorter:
replaced = re.sub(regex_string(replacements, True), lambda x:replacements.get(x.group(), x.group()), content)
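For completeness, the likely root cause in the original replace_words: the third positional argument of a compiled pattern's .sub() is count, not flags, and re.IGNORECASE | re.MULTILINE happens to equal 10, so only the first ten matches were being replaced. A minimal sketch of that one-line fix (the flags are unnecessary for a case-sensitive whole-word match):

def replace_words(content, replacements):
    rc = re.compile(r"[A-Za-z_]\w*")
    def translate(match):
        word = match.group(0)
        return replacements.get(word, word)  # unknown words pass through unchanged
    return rc.sub(translate, content)  # no stray count argument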
I have the following column which consists of email subject headers:
Subject
EXT || Transport enquiry
EXT || RE: EXTERNAL: RE: 0001 || Copy of enquiry
EXT || FW: Model - Jan
SV: [EXTERNAL] Calculations
What I want to achieve is:
Subject
Transport enquiry
0001 || Copy of enquiry
Model - Jan
Calculations
To achieve this I am using the code below, but it only applies the first regular expression I pass and ignores the rest:
def clean_subject_prelim(text):
    text = re.sub(r'^EXT \|\| $', '', text)
    text = re.sub(r'EXT \|\| RE: EXTERNAL: RE:', '', text)
    text = re.sub(r'EXT \|\| FW:', '', text)
    text = re.sub(r'^SV: \[EXTERNAL]$', '', text)
    return text

df['subject_clean'] = df['Subject'].apply(lambda x: clean_subject_prelim(x))
Why is this not working? What am I missing here?
You can use
pattern = r"""(?mx) # MULTILINE mode on
^ # start of string
(?: # non-capturing group start
EXT\s*\|\|\s*(?:RE:\s*EXTERNAL:\s*RE:|FW:)? # EXT || or EXT || RE: EXTERNAL: RE: or EXT || FW:
| # or
SV:\s*\[EXTERNAL]# SV: [EXTERNAL]
) # non-capturing group end
\s* # zero or more whitespaces
"""
df['subject_clean'] = df['Subject'].str.replace(pattern, '', regex=True)
Since re.X ((?x)) is used, literal spaces and # characters must be escaped, or you can simply use \s* / \s+ to match optional/required whitespace.
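As a quick self-contained check (a sketch using the pattern defined above and the question's sample data; pandas assumed available):

import pandas as pd

df = pd.DataFrame({"Subject": [
    "EXT || Transport enquiry",
    "EXT || RE: EXTERNAL: RE: 0001 || Copy of enquiry",
    "EXT || FW: Model - Jan",
    "SV: [EXTERNAL] Calculations"]})
df['subject_clean'] = df['Subject'].str.replace(pattern, '', regex=True)
print(df['subject_clean'].tolist())
# ['Transport enquiry', '0001 || Copy of enquiry', 'Model - Jan', 'Calculations']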
Get rid of the $ anchor in the first expression and reorder some of the regexes, like this:
import pandas as pd
import re
def clean_subject_prelim(text):
    text = re.sub(r'EXT \|\| RE: EXTERNAL: RE:', '', text)
    text = re.sub(r'EXT \|\| FW:', '', text)
    text = re.sub(r'^EXT \|\|', '', text)
    text = re.sub(r'^SV: \[EXTERNAL]', '', text)
    return text
data = {"Subject": [
"EXT || Transport enquiry",
"EXT || RE: EXTERNAL: RE: 0001 || Copy of enquiry",
"EXT || FW: Model - Jan",
"SV: [EXTERNAL] Calculations"]}
df = pd.DataFrame(data)
df['subject_clean'] = df['Subject'].apply(lambda x: clean_subject_prelim(x))
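Note: with this ordering, r'^EXT \|\|' leaves a leading space behind ("EXT || Transport enquiry" becomes " Transport enquiry"), so a final strip may be wanted; assuming pandas string methods, something like:

df['subject_clean'] = df['subject_clean'].str.strip()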
I'm trying to remove the trademark symbol (™), but only in the case where it's not part of another badly encoded symbol. For instance, I might have ’, which is a bad encoding of the quotation mark ('), so I don't want removing the trademark symbol (™) to break the pattern I'm using to replace xx™ with a quotation mark.
dict = {}
chars = {
    '\xe2\x84\xa2': '', # ™
    '\xe2\x80\x99': "'", # ’
}

def stats_change(char, number):
    if dict.has_key(char):
        dict[char] = dict[char] + number
    else:
        dict[char] = number # Add new entry

def replace_chars(match):
    char = match.group(0)
    stats_change(char, 1)
    return chars[char]

i, nmatches = re.subn("(\\" + '|\\'.join(chars.keys()) + ")", replace_chars, i)
count_matches += nmatches
Input: foo™ oof
Output: foo oof
Input: o’f oof
Output: o'f oof
Any suggestions?
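One suggestion (a sketch, not the asker's original code): build the pattern with re.escape instead of hand-prefixed backslashes, and sort the alternatives longest-first so sequences sharing a prefix (both begin with '\xe2') can never be half-matched:

pattern = '|'.join(sorted((re.escape(k) for k in chars), key=len, reverse=True))
i, nmatches = re.subn(pattern, replace_chars, i)

Both sequences here are three bytes long, so the sort is a no-op today, but it keeps the replacement safe if shorter sequences are added later.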
I've written the Decaf grammar specified in the CS143 course.
Here is my code.
import sys
from lark import Lark, Transformer, v_args
decaf_grammar = r"""
start : PROGRAM
PROGRAM : DECL+
DECL : VARIABLEDECL | FUNCTIONDECL | CLASSDECL | INTERFACEDECL
VARIABLEDECL : VARIABLE ";"
VARIABLE : TYPE "ident"
TYPE : "int" | "double" | "bool" | "string" | "ident" | TYPE "[]"
FUNCTIONDECL : ( TYPE "ident" "(" FORMALS ")" STMTBLOCK ) | ( "void" "ident" "(" FORMALS ")" STMTBLOCK )
FORMALS : VARIABLE ("," VARIABLE)*
CLASSDECL : "class" "ident" ["extends" "ident"] ["implements" "ident" ("," "ident")*] "{" FIELD* "}"
FIELD : VARIABLEDECL | FUNCTIONDECL
INTERFACEDECL : "interface" "ident" "{" PROTOTYPE* "}"
PROTOTYPE : (TYPE "ident" "(" FORMALS ")" ";") | ("void" "ident" "(" FORMALS ")" ";")
STMTBLOCK : "{" VARIABLEDECL* STMT* "}"
STMT : ( EXPR? ";") | IFSTMT | WHILESTMT | FORSTMT | BREAKSTMT | RETURNSTMT | RETURNSTMT | PRINTSTMT | STMTBLOCK
IFSTMT : "if" "(" EXPR ")" STMT ["else" STMT]
WHILESTMT : "while" "(" EXPR ")" STMT
FORSTMT : "for" "(" EXPR? ";" EXPR ";" EXPR? ")" STMT
RETURNSTMT : "return" EXPR? ";"
BREAKSTMT : "break" ";"
PRINTSTMT : "print" "(" EXPR ("," EXPR)* ")" ";"
EXPR : (LVALUE "=" EXPR) | CONSTANT | LVALUE | "this" | CALL | "(" EXPR ")" | (EXPR "+" EXPR) | (EXPR "-" EXPR) | (EXPR "*" EXPR) | (EXPR "/" EXPR) | (EXPR "%" EXPR) | ("-" EXPR) | (EXPR "<" EXPR) | (EXPR "<=" EXPR) | (EXPR ">" EXPR) | (EXPR ">=" EXPR) | (EXPR "==" EXPR) | (EXPR "!=" EXPR) | (EXPR "&&" EXPR) | (EXPR "||" EXPR) | ("!" EXPR) | ("ReadInteger" "(" ")") | ("ReadLine" "(" ")") | ("new" "ident") | ("NewArray" "(" EXPR "," TYPE ")")
LVALUE : "ident" | (EXPR "." "ident") | (EXPR "[" EXPR "]")
CALL : ("ident" "(" ACTUALS ")") | (EXPR "." "ident" "(" ACTUALS ")")
ACTUALS : EXPR ("," EXPR)* | ""
CONSTANT : "intConstant" | "doubleConstant" | "boolConstant" | "stringConstant" | "null"
"""
class TreeToJson(Transformer):
    #v_args(inline=True)
    def string(self, s):
        return s[1:-1].replace('\\"', '"')

json_parser = Lark(decaf_grammar, parser='lalr', lexer='standard', transformer=TreeToJson())
parse = json_parser.parse

def test():
    test_json = '''
{
}
'''
    j = parse(test_json)
    print(j)
    import json
    assert j == json.loads(test_json)

if __name__ == '__main__':
    test()
    #with open(sys.argv[1]) as f:
    #    print(parse(f.read()))
It throws:
RecursionError: maximum recursion depth exceeded
I'm using Lark for the first time.
The problem is that you're missing the difference between Lark's rules and terminals. Only terminals should be named in capitals, and they should match strings, not the structure of your grammar.
The key property of terminals you must respect is that, unlike rules, they cannot be recursive. Because of that, Lark struggles to build your grammar and runs into infinite recursion and a stack overflow.
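A minimal sketch of the idea, keeping the author's quoted literals as-is and only lowercasing the recursive definitions so Lark treats them as rules (the remaining productions follow the same pattern, and the grammar may still need further work for the LALR parser):

start : program
program : decl+
decl : variabledecl | functiondecl | classdecl | interfacedecl
variabledecl : variable ";"
variable : type "ident"
type : "int" | "double" | "bool" | "string" | "ident" | type "[]"
// ...and so on for the remaining productions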
Try using sys.setrecursionlimit(xxxx), where xxxx is the maximum recursion depth you want.
To learn more, visit docs.python.org/3.
I need to replace some special characters in user input for different platforms (i.e. Linux and Windows) using Python. Here is my code:
if request.method == 'POST':
    rname1 = request.POST.get('react')
Here I am getting the user input via the POST method. I need to remove the following characters from the user input (if any are present).
1- Escape or filter special characters for windows, ( ) < > * ‘ = ? ; [ ] ^ ~ ! . ” % # / \ : + , `
2- Escape or filter special characters for Linux, { } ( ) < > * ‘ = ? ; [ ] $ – # ~ ! . ” % / \ : + , `
The special characters are given above; I need to remove them for both Linux and Windows.
Python strings have a built-in method translate for substitution/deletion of characters. You need to build a translation table and then call the function.
import sys

# note: "win" in sys.platform also matches "darwin" on macOS;
# sys.platform.startswith("win") is a stricter Windows check
if "win" in sys.platform:
    special = """( ) < > * ‘ = ? ; [ ] ^ ~ ! . ” % # / \ : + , `""".split()
else:
    special = """{ } ( ) < > * ‘ = ? ; [ ] $ – # ~ ! . ” % / \ : + , `""".split()

trans_dict = {character: None for character in special}
trans_table = str.maketrans(trans_dict)
print("Lo+=r?e~~m ipsum dol;or sit!! amet, consectet..ur ad%".translate(trans_table))
This will print Lorem ipsum dolor sit amet consectetur ad.
If you want to use a replacement character instead of deleting, then replace None above with the character. You can also build a translation table with specific substitutions, e.g. {"a": "m", "b": "n", ...}.
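For instance, to substitute the curly quotes rather than delete them (a small sketch):

fix_quotes = str.maketrans({"‘": "'", "”": '"'})
print("‘Lorem” ipsum".translate(fix_quotes))  # prints 'Lorem" ipsum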
Edit: the above snippet is indeed Python 3. In Python 2 (TIO) it's easier to delete characters:
>>> import sys
>>> import string
>>> if "win" in sys.platform:
... special = """()<>*'=?;[]^~!%#/\:=,`"""
... else:
... special = """{}()<>*'=?;[]$-#~!."%/\:+"""
...
>>> s = "Lo+r?e~~/\#<>m ips()u;m"
>>> string.translate(s, None, special)
'Lorem ipsum'
Note that I've substituted ‘ with ' and similarly replaced ” with " because I think you're only dealing with ASCII strings.
I am a beginner with pyparsing but have experience with other parsing environments.
On my first small demo project I encountered strange parse-action behavior: the parse action of the base token (ident_simple) is called twice for each ident_simple token.
import io, sys
from pyparsing import *
def pa_ident_simple(s, l, t):
    print('ident_simple: ' + str(t))

def pa_ident_combined(s, l, t):
    print('ident_combined: ' + str(t))

def make_grammar():
    number = Word(nums)
    ident_simple = Word(alphas, alphanums + "_")
    ident_simple.setParseAction(pa_ident_simple)
    ident_combined = Combine(ident_simple + Literal('.') + ident_simple)
    ident_combined.setParseAction(pa_ident_combined)
    integer = number
    elems = (ident_combined | ident_simple | integer)
    grammar = OneOrMore(elems) + StringEnd()
    return grammar

if __name__ == "__main__":
    inp_str = "UUU FFF.XXX GGG"
    grammar = make_grammar()
    print(inp_str, "--->", grammar.parseString(inp_str))
For the 'ident_combined' token it looks good: the parse action is called once for each 'ident_simple' subtoken and once for the combined token.
I believe the combined token is the problem: the parse action of 'ident_simple' is called only once if 'ident_combined' is removed.
Can anybody give me a hint on how to combine tokens correctly?
Thanks for any help.
Update: while playing around, I used the class "Or" instead of "MatchFirst".
elems = ( ident_combined ^ ident_simple ^ integer)
This showed better behavior (in my opinion).
Output of original grammar (using "MatchFirst"):
ident_simple: ['UUU']
ident_simple: ['UUU']
ident_simple: ['FFF']
ident_simple: ['XXX']
ident_combined: ['FFF.XXX']
ident_simple: ['GGG']
ident_simple: ['GGG']
UUU FFF.XXX GGG ---> ['UUU', 'FFF.XXX', 'GGG']
Output of modified grammar (using "Or"):
ident_simple: ['UUU']
ident_simple: ['FFF']
ident_simple: ['XXX']
ident_combined: ['FFF.XXX']
ident_simple: ['GGG']
UUU FFF.XXX GGG ---> ['UUU', 'FFF.XXX', 'GGG']
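A likely explanation for the difference, based on how pyparsing alternation works: MatchFirst (|) parses each alternative with parse actions enabled, so when ident_combined matches its leading ident_simple and then fails at the '.', the action for 'UUU' has already fired before ident_simple is retried as the next alternative. Or (^) first probes each alternative without running parse actions and then re-parses only the longest match with actions enabled, which is why each action fires exactly once in the second output.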