Regular Expression for single line comments - python

What is the regular expression for single line java comments:
I am trying the following grammar :
def single_comment(t):
r'\/\/.~(\n)'
#r'//.*$'
pass
but, I am unable to ignore the single line comments how can I do it?

Python regular expression for matching single line comments (only matches comments that start with //, not /* */). Unfortunately, this regular expression is pretty ugly as it has to account for escaped characters and // within strings. You should find a more easily understandable solution if you ever need this in real code.
import re
pattern = re.compile(r'^(?:[^"/\\]|\"(?:[^\"\\]|\\.)*\"|/(?:[^/"\\]|\\.)|/\"(?:[^\"\\]|\\.)*\"|\\.)*//(.*)$')
This is a little script that runs a bunch of test strings against the pattern.
import re
pattern = re.compile(r'^(?:[^"/\\]|\"(?:[^\"\\]|\\.)*\"|/(?:[^/"\\]|\\.)|/\"(?:[^\"\\]|\\.)*\"|\\.)*//(.*)$')
tests = [
(r'// hello world', True),
(r' // hello world', True),
(r'hello world', False),
(r'System.out.println("Hello, World!\n"); // prints hello world', True),
(r'String url = "http://www.example.com"', False),
(r'// hello world', True),
(r'//\\', True),
(r'// "some comment"', True),
(r'new URI("http://www.google.com")', False),
(r'System.out.println("Escaped quote\""); // Comment', True)
]
tests_passed = 0
for test in tests:
match = pattern.match(test[0])
has_comment = match != None
if has_comment == test[1]:
tests_passed += 1
print "Passed {0}/{1} tests".format(tests_passed, len(tests))

I think this works (using pyparsing):
data = """
class HelloWorld {
// method main(): ALWAYS the APPLICATION entry point
public static void main (String[] args) {
System.out.println("Hello World!"); // Nested //Print 'Hello World!'
System.out.println("http://www.example.com"); // Another nested // Print a URL
System.out.println("\"http://www.example.com"); // A nested escaped quote // Print another URL
}
}"""
from pyparsing import *
from pprint import pprint
dbls = QuotedString('"', '\\', '"')
sgls = QuotedString("'", '\\', "'")
strings = dbls | sgls
pprint(dblSlashComment.ignore(strings).searchString(data).asList())
[['// method main(): ALWAYS the APPLICATION entry point'],
["// Nested //Print 'Hello World!'"],
['// Another nested // Print a URL'],
['// A nested escaped quote // Print another URL']]
Should you have /* ... */ style comments, that happen to have single line comments in them, and don't actually want those, then you can use:
pprint(dblSlashComment.ignore(strings | cStyleComment).searchString(data).asList())
(as discussed in https://chat.stackoverflow.com/rooms/26267/discussion-between-nhahtdh-and-martega)

Related

Preserving indentation in a triple-quoted fstring

I am trying to use a triple-quoted strings in Python3(.7) to build some formated strings.
I have a list of inner strings, which all need to be tabbed in:
This is some text
across multiple
lines.
And a string which should contain the inner string
data{
// string goes here
}
I cannot tab the inner string when I create it. So, my thought was to use dedent with Python3 triple-quoted fstrings:
import textwrap
inner_str = textwrap.dedent(
'''\
This is some text
across multiple
lines.'''
)
full_str = textwrap.dedent(
f'''\
data{{
// This should all be tabbed
{inner_str}
}}'''
)
print(full_str)
However, the indentation is not maintained:
data{
// This should all be tabbed
This is some text
across multiple
lines.
}
The desired result:
data{
// This should all be tabbed
This is some text
across multiple
lines.
}
How can I preserve the indentation of the fstring without pre-tabbing the inner string?
This seems to provide what you want.
import textwrap
inner_str = textwrap.dedent(
'''\
This is some text
across multiple
lines.'''
)
full_str = textwrap.dedent(
f'''
data{{
{textwrap.indent(inner_str, " ")}
}}'''
)
A better solution:
idt = str.maketrans({'\n': "\n "})
print(textwrap.dedent(
f'''
data{{
{inner_str.translate(idt)}
}}'''
))
Another solution with customized tab width:
def indent_inner(inner_str, indent):
return inner_str.replace('\n', '\n' + indent) # os.linesep could be used if the function is needed across different OSs
print(textwrap.dedent(
f'''
data{{
{indent_inner(inner_str, " ")}
}}'''
))
None of the answers here seemed to do what I want, so I'm answering my own question with a solution which gets as close to what I was looking for as I can make. It is not required to pre-tab the data to define its indentation level, nor is it required to not indent the line (which breaks readability). Instead, the indentation level of the current line in the fstring is passed at usage time.
Not perfect but it works.
import textwrap
def code_indent(text, tab_sz, tab_chr=' '):
def indented_lines():
for i, line in enumerate(text.splitlines(True)):
yield (
tab_chr * tab_sz + line if line.strip() else line
) if i else line
return ''.join(indented_lines())
inner_str = textwrap.dedent(
'''\
This is some text
across multiple
lines.'''
)
full_str = textwrap.dedent(
f'''
data{{
{code_indent(inner_str, 8)}
}}'''
)
print(full_str)
Edited to avoid tabbing inner_str.
import textwrap
line_tab = '\n\t'
inner_str = f'''\
This is some text
across multiple
lines.
'''
full_str = textwrap.dedent(f'''\
data{{
// This should all be tabbed
{line_tab.join(inner_str.splitlines())}
}}''')
)
print(full_str)
Output:
data{
// This should all be tabbed
This is some text
across multiple
lines.
}

Find function calls with python using Regular expressions

I am working with a language where the modules are defined as
<module_name> <inst_name>(.<port_name> (<net_name>)….);
or
module1 inst1 ( .input a,
.output b;
port b=a;);
I want to find all such modules, while ignoring function calls .
I'm having difficulty with regex. I am looking for this
text1 text2 ( .text3; text4 );
note that all the spaces except the ones between text 1 and text2 are optional and might be new lines instead of spaces.text 3 and text4 can be multi lines but all are in the form of
text3 - >
.blah1 (blah2),
.blah3 (blah4)
text4->
blah1 blah2=xyz;
blah3 blah4=qwe;
I am trying to do
re.split(r"^[a-zA-Z]*\s[a-zA-Z]*\s?\n?\([a-zA-Z]*\s?\n?;[a-zA-Z]*\);", data)
Doesn't work though.It just grabs everything. How do i fix it? Thanks !!
I do need to grab everything individually, eventually (module/instances/port/nets). I think I can split it once regex is working.
I think you need to write a parser that understands enough of the language to at least canonicalize it before you try extracting information. You could write a simple parser by hand, or you could use a parsing framework such as PLY or others of that ilk.
To give you a more concrete idea about what I'm suggesting, consider
the following code, which defines a parse_data function that, given
the contents of a file, will yield a series of tokens recognized in
that file:
import re
tokens = {
'lparen': '\(',
'rparen': '\)',
'comma': ',',
'semicolon': ';',
'whitespace': '\s+',
'equals': '=',
'identifier': '[.\d\w]+',
}
tokens = dict((k, re.compile(v)) for k,v in tokens.items())
def parse_data(data):
while data:
for tn, tv in tokens.items():
mo = tv.match(data)
if mo:
matched = data[mo.start():mo.end()]
data = data[mo.end():]
yield tn, matched
Using this, you could write something that would put your sample input
into canonical form:
with open('inputfile') as fd:
data = fd.read()
last_token = (None, None)
for tn, tv in parse(data):
if tn == 'whitespace' and last_token[0] != 'semicolon':
print ' ',
elif tn == 'whitespace':
pass
elif tn == 'semicolon' and last_token[0] == 'rparen':
print tv
else:
print tv,
last_token = (tn, tv)
Given input like this:
module1 inst1 ( .input a,
.output b;
port b=a;);
module2 inst2 ( .input a, .output b; port b=a;);
module3 inst3 ( .input a, .output b;
port b=a;);
The above code would yield:
module1 inst1 ( .input a , .output b ; port b = a ; ) ;
module2 inst2 ( .input a , .output b ; port b = a ; ) ;
module3 inst3 ( .input a , .output b ; port b = a ; ) ;
Which, because it is in standard form, would be much more amendable to
extracting information via simple pattern matching.
Note that while this code relies on reading the entire source file
into memory first, you could fairly easily write code that you parse a
file in fragments if you were concerned about memory utilization.

Search txt file and display data between all occurrences

Extremely new to Python. I started off writing a PHP script to find all occurrences of 2 strings in a txt file but it's using too much memory so i read Python would be better.
Basically what i need to do is:
- import a txt file
- go through it and return all the data between the tags below
- remove any duplicates
- output the results
The bits i'm looking for look like this:
------DATA--------------------------------------------------
DATA TO SHOW
------------------------------------------------------------
Naturally, the important bit to output is the DATA TO SHOW part.
Any help would be appreciated :)
Thanks
UPDATE -----------------------
import re
inputFile = open("small.txt", "r")
output = open("result.txt", "w")
searchStart = "----- ASSERT --------------------------------------------------------------------------------"
searchEnd = "---------------------------------------------------------------------------------------------"
match = re.findall('^----- ASSERT --------------------------------------------------------------------------------\n(.*?)---------------------------------------------------------------------------------------------', inputFile.read(), re.MULTILINE)
print match,
Any ideas how to get it to show all the lines until it hits the searchEnd tag? Example data:
----- ASSERT --------------------------------------------------------------------------------
MORE
INFO
THAT
I
NEED
TO
GET
FROM
THE
FILE
---------------------------------------------------------------------------------------------
An example with php (not tested, the idea is here) :
$handle = fopen("inputfile.txt", "r");
if ($handle) {
$record = false;
while (($line = fgets($handle)) !== false) {
if ($line == '------DATA--------------------------------------------------') {
$record = true;
$temp = '';
} elseif ($record) {
if ($line == '------------------------------------------------------------') {
$record = false;
$results[] = $temp;
$temp = '';
} else $temp .= $line;
}
}
} else {
echo 'Gargoyl, the file can\'t be opened!';
}
fclose($handle);
print_r($results);

Python convert C header file to dict

I have a C header file which contains a series of classes, and I'm trying to write a function which will take those classes, and convert them to a python dict. A sample of the file is down the bottom.
Format would be something like
class CFGFunctions {
class ABC {
class AA {
file = "abc/aa/functions"
class myFuncName{ recompile = 1; };
};
class BB
{
file = "abc/bb/functions"
class funcName{
recompile=1;
}
}
};
};
I'm hoping to turn it into something like
{CFGFunctions:{ABC:{AA:"myFuncName"}, BB:...}}
# Or
{CFGFunctions:{ABC:{AA:{myFuncName:"string or list or something"}, BB:...}}}
In the end, I'm aiming to get the filepath string (which is actually a path to a folder... but anyway), and the class names in the same class as the file/folder path.
I've had a look on SO, and google and so on, but most things I've found have been about splitting lines into dicts, rather then n-deep 'blocks'
I know I'll have to loop through the file, however, I'm not sure the most efficient way to convert it to the dict.
I'm thinking I'd need to grab the outside class and its relevant brackets, then do the same for the text remaining inside.
If none of that makes sense, it's cause I haven't quite made sense of the process myself haha
If any more info is needed, I'm happy to provide.
The following code is a quick mockup of what I'm sorta thinking...
It is most likely BROKEN and probably does NOT WORK. but its sort of the process that I'm thinking of
def get_data():
fh = open('CFGFunctions.h', 'r')
data = {} # will contain final data model
# would probably refactor some of this into a function to allow better looping
start = "" # starting class name
brackets = 0 # number of brackets
text= "" # temp storage for lines inside block while looping
for line in fh:
# find the class (start
mt = re.match(r'Class ([\w_]+) {', line)
if mt:
if start == "":
start = mt.group(1)
else:
# once we have the first class, find all other open brackets
mt = re.match(r'{', line)
if mt:
# and inc our counter
brackets += 1
mt2 = re.match(r'}', line)
if mt2:
# find the close, and decrement
brackets -= 1
# if we are back to the initial block, break out of the loop
if brackets == 0:
break
text += line
data[start] = {'tempText': text}
====
Sample file
class CfgFunctions {
class ABC {
class Control {
file = "abc\abc_sys_1\Modules\functions";
class assignTracker {
description = "";
recompile = 1;
};
class modulePlaceMarker {
description = "";
recompile = 1;
};
};
class Devices
{
file = "abc\abc_sys_1\devices\functions";
class registerDevice { recompile = 1; };
class getDeviceSettings { recompile = 1; };
class openDevice { recompile = 1; };
};
};
};
EDIT:
If possible, if I have to use a package, I'd like to have it in the programs directory, not the general python libs directory.
As you detected, parsing is necessary to do the conversion. Have a look at the package PyParsing, which is a fairly easy-to-use library to implement parsing in your Python program.
Edit: This is a very symbolic version of what it would take to recognize a very minimalistic grammer - somewhat like the example at the top of the question. It won't work, but it might put you in the right direction:
from pyparsing import ZeroOrMore, OneOrMore, \
Keyword, Literal
test_code = """
class CFGFunctions {
class ABC {
class AA {
file = "abc/aa/functions"
class myFuncName{ recompile = 1; };
};
class BB
{
file = "abc/bb/functions"
class funcName{
recompile=1;
}
}
};
};
"""
class_tkn = Keyword('class')
lbrace_tkn = Literal('{')
rbrace_tkn = Literal('}')
semicolon_tkn = Keyword(';')
assign_tkn = Keyword(';')
class_block = ( class_tkn + identifier + lbrace_tkn + \
OneOrMore(class_block | ZeroOrMore(assignment)) + \
rbrace_tkn + semicolon_tkn \
)
def test_parser(test):
try:
results = class_block.parseString(test)
print test, ' -> ', results
except ParseException, s:
print "Syntax error:", s
def main():
test_parser(test_code)
return 0
if __name__ == '__main__':
main()
Also, this code is only the parser - it does not generate any output. As you can see in the PyParsing docs, you can later add the actions you want. But the first step would be to recognize the what you want to translate.
And a last note: Do not underestimate the complexities of parsing code... Even with a library like PyParsing, which takes care of much of the work, there are many ways to get mired in infinite loops and other amenities of parsing. Implement things step-by-step!
EDIT: A few sources for information on PyParsing are:
http://werc.engr.uaf.edu/~ken/doc/python-pyparsing/HowToUsePyparsing.html
http://pyparsing.wikispaces.com/
(Particularly interesting is http://pyparsing.wikispaces.com/Publications, with a long list of articles - several of them introductory - on PyParsing)
http://pypi.python.org/pypi/pyparsing_helper is a GUI for debugging parsers
There is also a 'tag' Pyparsing here on stackoverflow, Where Paul McGuire (the PyParsing author) seems to be a frequent guest.
* NOTE: *
From PaulMcG in the comments below: Pyparsing is no longer hosted on wikispaces.com. Go to github.com/pyparsing/pyparsing

Converting comment // with /* with Python re

In my C program, I should convert every "// comment" with "/* comment */"
I thought of the code below with Python re, but could't come up with any idea of how to
insert */ before at the end of the line.
regexp = re.compile(r'//', re.MULTILINE)
c = regexp.sub('/*',c)
Any idea will be grateful. Thank you.
Here's a start for a (IMO) "better" solution. It uses regex to scan through the input and match not only single line comments, but also those tokens that can contain //:
#!/usr/bin/env python
import re
import sys
source = """
/*
* not
// a single line
// comment
*/
// include stuff
#include<stdio.h>
main()
{
// comments with a '*/' !
printf("Again, no single // line comment"); // comment
}
"""
pattern = re.compile(r""" (//([^\r\n]*)) # match a single line comment
| "[^"]*" # match a string literal
| /\*.*?\*/ # match a multi line comment
| . # fall through: match any char
"""
, re.X | re.S)
print("%s" % source)
print('=' * 40)
for match in pattern.finditer(source):
if match.group(1):
# we found a single line comment
s = match.group(2).replace('*/', '* /')
sys.stdout.write("/*" + s + " */")
else:
# something other than a single line comment, just print it
sys.stdout.write(match.group())
which will print:
/*
* not
// a single line
// comment
*/
// include stuff
#include<stdio.h>
main()
{
// comments with a '*/' !
printf("Again, no single // line comment"); // comment
}
========================================
/*
* not
// a single line
// comment
*/
/* include stuff */
#include<stdio.h>
main()
{
/* comments with a '* /' ! */
printf("Again, no single // line comment"); /* comment */
}
Adapted from a previous post that was looking for single line comments in Java, you can use pyparsing:
data = """
class HelloWorld {
// method main(): ALWAYS the APPLICATION entry point
public static void main (String[] args) {
System.out.println("Hello World!"); // Nested //Print 'Hello World!'
System.out.println("http://www.example.com"); // Another nested // Print a URL
System.out.println("\"http://www.example.com"); // A nested escaped quote // Print another URL
}
}"""
from pyparsing import *
dbls = QuotedString('"', '\\', '"')
sgls = QuotedString("'", '\\', "'")
strings = dbls | sgls
g = dblSlashComment.ignore(strings)
g.setParseAction(lambda L,: '/*' + L[0].lstrip('//') + '*/')
print g.transformString(data)
Outputs:
class HelloWorld {
/* method main(): ALWAYS the APPLICATION entry point*/
public static void main (String[] args) {
System.out.println("Hello World!"); /* Nested //Print 'Hello World!'*/
System.out.println("http://www.example.com"); /* Another nested // Print a URL*/
System.out.println(""http://www.example.com"); /* A nested escaped quote // Print another URL*/
}
}

Categories