I'd like to create a regular expression in Python that will match against a line in Python source code and return a list of function calls.
The typical line would look like this:
something = a.b.method(time.time(), var=1) + q.y(x.m())
and the result should be:
["a.b.method()", "time.time()", "q.y()", "x.m()"]
I have two problems here:
creating the correct pattern
the catch groups are overlapping
thank you for help
I don't think regular expressions are the best approach here. Consider the ast module instead, for example:
import ast


class ParseCall(ast.NodeVisitor):
    """Collect the dotted-name parts of a function expression, left to right.

    After visiting ``a.b.method`` the ``ls`` attribute holds
    ``['a', 'b', 'method']``.
    """

    def __init__(self):
        # Name parts gathered so far, outermost object first.
        self.ls = []

    def visit_Attribute(self, node):
        # Descend first so the object on the left of the dot is recorded
        # before the attribute name itself.
        self.generic_visit(node)
        self.ls.append(node.attr)

    def visit_Name(self, node):
        self.ls.append(node.id)


class FindFuncs(ast.NodeVisitor):
    """Print the dotted name of every call found in the visited tree."""

    def visit_Call(self, node):
        p = ParseCall()
        p.visit(node.func)
        # Parenthesised single-argument form works on Python 2 and Python 3.
        print(".".join(p.ls))
        # Keep walking so calls nested in the arguments are reported as well.
        self.generic_visit(node)


code = 'something = a.b.method(foo() + xtime.time(), var=1) + q.y(x.m())'
tree = ast.parse(code)
FindFuncs().visit(tree)
result
a.b.method
foo
xtime.time
q.y
x.m
$ python3
>>> import re
>>> from itertools import chain
>>> def fun(s, r):
... t = re.sub(r'\([^()]+\)', '()', s)
... m = re.findall(r'[\w.]+\(\)', t)
... t = re.sub(r'[\w.]+\(\)', '', t)
... if m==r:
... return
... for i in chain(m, fun(t, m)):
... yield i
...
>>> list(fun('something = a.b.method(time.time(), var=1) + q.y(x.m())', []))
['time.time()', 'x.m()', 'a.b.method()', 'q.y()']
/([.a-zA-Z]+)\(/g
should match the method names; you'd have to add the parens after since you have some nested.
I don't really know Python, but I can imagine that making this work properly involves some complications, eg:
strings
comments
expressions that return an object
But for your example, an expression like this works:
(?:\w+\.)+\w+\(
I have an example for you proving this is doable in Python3
import re
def parse_func_with_params(inp):
    """Parse a call such as ``name(a, b, c)`` into its name and parameters.

    The call must start at the beginning of *inp*. Names consist of ASCII
    letters and ``-`` (the character set the original pattern accepted).
    The parameter list may be empty.

    The resulting tuple ``(name, param1, param2, ...)`` is printed, as
    before, and also returned so callers can use it.

    Raises:
        ValueError: if *inp* does not start with such a call. The previous
            implementation looped forever in that case.
    """
    ident = r"[a-zA-Z-]+"
    # One pattern covers any number of parameters, so there is no need to
    # grow the pattern inside a retry loop guarded by a bare ``except``.
    call = re.compile(
        r"(" + ident + r")\s*\(\s*"
        r"(?:(" + ident + r")\s*((?:,\s*" + ident + r"\s*)*))?"
        r"\)"
    )
    found = call.match(inp)
    if found is None:
        raise ValueError("not a simple function call: %r" % (inp,))

    name, first, rest = found.groups()
    params = [] if first is None else [first] + re.findall(ident, rest)
    result = (name,) + tuple(params)
    print(result)
    return result
Command line Input: animalFunc(lion, tiger, giraffe, singe)
Output: ('animalFunc', 'lion', 'tiger', 'giraffe', 'singe')
As you can see, the function name is always the first element of the tuple, and the rest are the names of the parameters passed.
Related
I have a text file a.txt. I want to perform some preprocessing on it, such as removing punctuation and splitting it into words.
I have written the following code to perform few operations.
class pre:
def __init__(self,textfilepath):
self.textfilepath = textfilepath
def __str__(self,textfilepath):
return str(textfilepath)
def process(textpathfile):
with open(textpathfile, r) as abc:
a = abc.translate(string.maketrans("",""), string.punctuation)
a = a.split(' ')
return a
pre("a.txt")
I tried executing it, but it gave an error saying that pre doesn't take arguments. Can anyone help me with how to do this? Thanks all.
You shouldn't pass arguments to __str__. Instead you can access them through the properties of self:
class pre:
    """Read a text file and split its punctuation-free contents into words."""

    def __init__(self, textfilepath):
        # Path of the text file that process() reads.
        self.textfilepath = textfilepath

    def __str__(self):
        return self.textfilepath

    def process(self):
        """Return the file's text, stripped of punctuation, split on spaces.

        Every character in ``string.punctuation`` is removed, then the text
        is split on single space characters (so newlines stay inside the
        pieces, exactly as ``split(' ')`` leaves them).
        """
        # Local import keeps this snippet self-contained.
        import string

        # The mode must be the string "r"; the bare name r is undefined.
        with open(self.textfilepath, "r") as abc:
            # translate() is a string method, so read the contents first.
            text = abc.read()

        punctuation = set(string.punctuation)
        # Filtering characters works on both Python 2 and Python 3, unlike
        # string.maketrans("", ""), which exists only on Python 2.
        text = "".join(ch for ch in text if ch not in punctuation)
        return text.split(' ')
p = pre("a.txt")
print(p)
filedata = p.process()
print(filedata)
I would like to create a list of all the functions used in a code file. For example if we have following code in a file named 'add_random.py'
`
import numpy as np
from numpy import linalg
def foo():
print np.random.rand(4) + np.random.randn(4)
print linalg.norm(np.random.rand(4))
`
I would like to extract the following list:
[numpy.random.rand, np.random.randn, np.linalg.norm, np.random.rand]
The list contains the functions used in the code with their actual name in the form of 'module.submodule.function'. Is there something built in python language that can help me do this?
You can extract all call expressions with:
import ast
class CallCollector(ast.NodeVisitor):
    """Collect the dotted name of each call's function expression.

    After ``visit(tree)``, ``calls`` lists one string per recorded call, e.g.
    ``'np.random.rand'``. Only names and attribute chains are understood;
    any other node inside a function expression triggers a printed warning.
    Call arguments are not traversed, so calls nested inside arguments are
    not recorded.
    """

    def __init__(self):
        self.calls = []
        # Name being assembled for the call currently traced; None when the
        # visitor is not inside a function expression.
        self.current = None

    def visit_Call(self, node):
        enclosing = self.current
        if enclosing is not None:
            # A call inside another call's function expression, as in
            # ``a().b()``: same warning as any other unsupported node.
            print("warning: {} node in function expression not supported".format(
                node.__class__.__name__))
        # new call, trace the function expression
        self.current = ''
        self.visit(node.func)
        self.calls.append(self.current)
        # Restore the enclosing state (None at top level) so an outer
        # function expression can keep appending without hitting None.
        self.current = enclosing

    def generic_visit(self, node):
        if self.current is not None:
            print("warning: {} node in function expression not supported".format(
                node.__class__.__name__))
        super(CallCollector, self).generic_visit(node)

    # record the func expression

    def visit_Name(self, node):
        if self.current is None:
            return
        self.current += node.id

    def visit_Attribute(self, node):
        if self.current is None:
            # Not tracing a call: just keep walking. Returning here matters;
            # falling through would append to None and raise TypeError.
            self.generic_visit(node)
            return
        self.visit(node.value)
        self.current += '.' + node.attr
Use this with a ast parse tree:
tree = ast.parse(yoursource)
cc = CallCollector()
cc.visit(tree)
print cc.calls
Demo:
>>> tree = ast.parse('''\
... def foo():
... print np.random.rand(4) + np.random.randn(4)
... print linalg.norm(np.random.rand(4))
... ''')
>>> cc = CallCollector()
>>> cc.visit(tree)
>>> cc.calls
['np.random.rand', 'np.random.randn', 'linalg.norm']
The above walker only handles names and attributes; if you need more complex expression support, you'll have to extend this.
Note that collecting names like this is not a trivial task. Any indirection would not be handled. You could build a dictionary in your code of functions to call and dynamically swap out function objects, and static analysis like the above won't be able to track it.
In general, this problem is undecidable; consider for example getattr(random, "random")().
If you want static analysis, the best there is now is jedi
If you accept dynamic solutions, then coverage is your best friend. It will show all used functions, rather than only the directly referenced ones.
Finally you can always roll your own dynamic instrumentation along the lines of:
import random
import logging
class Proxy(object):
    """Stand-in for the ``random`` module that logs each attribute lookup."""

    def __getattr__(self, name):
        # Log the access, then hand back the real module's attribute.
        logging.debug("tried to use random.%s", name)
        # getattr is the builtin; "getattribute" does not exist.
        return getattr(_random, name)


# Keep a handle on the real module, then shadow the name with the proxy.
_random = random
random = Proxy()
im trying to parse lines in the form:
(OP something something (OP something something ) ) ( OP something something )
Where OP is a symbol for a logical gate (AND, OR, NOT) and something is the thing i want to evaluate.
The output im looking for is something like:
{ 'OPERATOR': [condition1, condition2, .. , conditionN] }
Where a condition itself can be a dict/list pair itself (nested conditions). So far i tried something like:
tree = dict()
cond = list()
tree[OP] = cond
for string in conditions:
self.counter += 1
if string.startswith('('):
try:
OP = string[1]
except IndexError:
OP = 'AND'
finally:
if OP == '?':
OP = 'OR'
elif OP == '!':
OP = 'N'
# Recurse
cond.append(self.parse_conditions(conditions[self.counter:], OP))
break
elif not string.endswith(")"):
cond.append(string)
else:
return tree
return tree
I tried other ways as well, but I just can't wrap my head around this whole recursion thing, so I'm wondering if I could get some pointers here. I looked around the web and found some material about recursive descent parsing, but the tutorials were all trying to do something more complicated than I needed.
PS: i realize i could do this with existing python libraries but what would i learn by doing that eh?
I'm posting this without further comments, for learning purposes (in real life, please do use a library). Note that there's no error checking (homework for you!).
Feel free to ask if there's something you don't understand.
# PART 1. The Lexer

# Token list shared by the lexer and the parser; filled in by read().
symbols = None


def read(input):
    """Split *input* into word and parenthesis tokens, stored in ``symbols``."""
    global symbols
    import re
    symbols = re.findall(r'\w+|[()]', input)


def getsym():
    """Return the next token without consuming it, or None if none are left."""
    return symbols[0] if symbols else None


def popsym():
    """Consume and return the next token.

    Raises:
        IndexError: if no tokens are left (the input ended too early).
    """
    if not symbols:
        raise IndexError("unexpected end of input")
    return symbols.pop(0)


def _expect(token):
    """Consume the next token and require it to be *token*."""
    found = popsym()
    if found != token:
        raise ValueError("expected %r but found %r" % (token, found))


# PART 2. The Parser
# Built upon the following grammar:
#
# program = expr*
# expr = '(' func args ')'
# func = AND|OR|NOT
# args = arg*
# arg = string|expr
# string = [a..z]

def program():
    """Parse every top-level expression and return them as a list."""
    r = []
    while getsym():
        r.append(expr())
    return r


def expr():
    """Parse ``'(' func args ')'`` into a one-entry dict ``{func: args}``.

    Raises:
        ValueError: if the expression does not start with ``(``; previously
            whatever token came first was silently consumed in its place.
    """
    _expect('(')
    f = func()
    a = args()
    _expect(')')
    return {f: a}


def func():
    """Consume and return the operator token."""
    return popsym()


def args():
    """Parse arguments up to, but not including, the closing parenthesis."""
    r = []
    while getsym() != ')':
        r.append(arg())
    return r


def arg():
    """Parse one argument: a nested expression or a plain token."""
    if getsym() == '(':
        return expr()
    return string()


def string():
    """Consume and return a plain token."""
    return popsym()


# TEST = Lexer + Parser

def parse(input):
    """Tokenize and parse *input*, returning a list of nested dicts."""
    read(input)
    return program()


print(parse('(AND a b (OR c d)) (NOT foo) (AND (OR x y))'))
# [{'AND': ['a', 'b', {'OR': ['c', 'd']}]}, {'NOT': ['foo']}, {'AND': [{'OR': ['x', 'y']}]}]
I'm trying to use RegEx within Python to parse out a function definition and NOTHING else. I keep running into problems though. Is RegEx the right tool to be using here?
i.e.
def foo():
print bar
-- Matches --
a = 2
def foo():
print bar
-- Doesn't match as there's code above the def --
def foo():
print bar
a = 2
-- Doesn't match as there's code below the def --
An example of a string I'm trying to parse is "def isPalindrome(x):\n return x == x[::-1]". But in reality that might contain lines above or below the def itself.
What RegEx expression would I have to use to achieve this?
No, regular expressions are not the right tool for this job. This is similar to people desperately trying to parse HTML with regular expressions. These languages are not regular. Thus you can't work around all quirks you will encounter.
Use the built-in parser module, build a parse tree, check for definition nodes and use them instead. It's even better to use the ast module as it is way more convenient to use. An example:
import ast
mdef = 'def foo(x): return 2*x'
a = ast.parse(mdef)
# Walk every node of the tree and keep the function definitions.
# isinstance is the idiomatic node-type test (rather than comparing type()).
definitions = [n for n in ast.walk(a) if isinstance(n, ast.FunctionDef)]
# Matches a whole multi-line function definition:
#   group 2 - indentation of the "def" line
#   group 3 - extra indentation of the first body line
# followed by every later line that starts with that combined indentation
# (blank lines are allowed in between).
# Raw strings keep regex escapes such as \w and \( from being read as
# (invalid) string escapes; the pattern itself is unchanged.
reg = re.compile(r'((^ *)def \w+\(.*?\): *\r?\n'
                 r'(?: *\r?\n)*'
                 r'\2( +)[^ ].*\r?\n'
                 r'(?: *\r?\n)*'
                 r'(\2\3.*\r?\n(?: *\r?\n)*)*)',
                 re.MULTILINE)
EDIT
import re
script = '''
def foo():
print bar
a = 2
def foot():
print bar
b = 10
"""
opopo =457
def foor(x):
print bar
print x + 10
def g(u):
print
def h(rt,o):
assert(rt==12)
a = 2
class AZERT(object):
pass
"""
b = 10
def tabulae(x):
\tprint bar
\tprint x + 10
\tdef g(u):
\t\tprint
\tdef h(rt,o):
\t\tassert(rt==12)
a = 2
class Z:
def inzide(x):
print baracuda
print x + 10
def gululu(u):
print
def hortense(rt,o):
assert(rt==12)
def oneline(x): return 2*x
def scroutchibi(h%,n():245sqfg srot b#
'''
.
# Pattern with two alternatives; re.MULTILINE lets ^ match at each line start.
#   group 1: a one-line definition - the "def name(...):" header followed by
#            non-blank text on the same line.
#   group 2: a multi-line definition - the header (group 3 = its indentation),
#            optional blank lines, a first body line indented further
#            (group 4 = the extra indentation), then every following line that
#            starts with the same combined indentation.
reg = re.compile('((?:^[ \t]*)def \w+\(.*\): *(?=.*?[^ \t\n]).*\r?\n)'
'|'
'((^[ \t]*)def \w+\(.*\): *\r?\n'
'(?:[ \t]*\r?\n)*'
'\\3([ \t]+)[^ \t].*\r?\n'
'(?:[ \t]*\r?\n)*'
'(\\3\\4.*\r?\n(?: *\r?\n)*)*)',
re.MULTILINE)
# Matches text enclosed in a pair of identical triple quotes; group 2 is the
# enclosed text (re.DOTALL lets it span several lines).
regcom = re.compile('("""|\'\'\')(.+?)\\1',re.DOTALL)
# (start, end) offsets of every triple-quoted body found in script.
avoided_spans = [ma.span(2) for ma in regcom.finditer(script)]
print 'eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee'
# For each candidate definition: show it, show its repr, then try to run it.
for ma in reg.finditer(script):
# Python 2 print statement; the trailing comma suppresses the newline.
print ma.group(),
print '--------------------'
print repr(ma.group())
print
# NOTE(review): exec runs the matched text as code - only safe on trusted input.
try:
exec(ma.group().strip())
except:
print " isn't a valid definition of a function"
# Flag matches that lie entirely inside one of the triple-quoted spans.
am,bm = ma.span()
if any(a<=am<=bm<=b for a,b in avoided_spans):
print ' is a commented definition function'
print 'eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee'
result
eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
def foo():
print bar
--------------------
'def foo():\n print bar\n\n'
eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
def foot():
print bar
--------------------
'def foot():\n print bar\n\n'
eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
def foor(x):
print bar
print x + 10
def g(u):
print
def h(rt,o):
assert(rt==12)
--------------------
'def foor(x):\n\n\n print bar\n print x + 10\n def g(u):\n print\n\n def h(rt,o):\n assert(rt==12)\n'
is a commented definition function
eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
def tabulae(x):
print bar
print x + 10
def g(u):
print
def h(rt,o):
assert(rt==12)
--------------------
'def tabulae(x):\n\n\n\tprint bar\n\tprint x + 10\n\tdef g(u):\n\t\tprint\n\n\tdef h(rt,o):\n\t\tassert(rt==12)\n'
eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
def inzide(x):
print baracuda
print x + 10
def gululu(u):
print
def hortense(rt,o):
assert(rt==12)
--------------------
' def inzide(x):\n\n\n print baracuda\n print x + 10\n def gululu(u):\n print\n\n def hortense(rt,o):\n assert(rt==12)\n\n\n\n'
eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
def oneline(x): return 2*x
--------------------
'def oneline(x): return 2*x\n'
eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
def scroutchibi(h%,n():245sqfg srot b#
--------------------
'def scroutchibi(h%,n():245sqfg srot b#\n'
isn't a valid definition of a function
eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
I'm trying to rewrite the equivalent of the python replace() function without using regexp. Using this code, i've managed to get it to work with single chars, but not with more than one character:
def Replacer(self, find_char, replace_char):
s = []
for char in self.base_string:
if char == find_char:
char = replace_char
#print char
s.append(char)
s = ''.join(s)
my_string.Replacer('a','E')
Anybody have any pointers how to make this work with more than one character? example:
my_string.Replacer('kl', 'lll')
How clever are you trying to be?
def Replacer(self, find, replace):
    """Return the string *self* with every *find* replaced by *replace*.

    Works by splitting on *find* and re-joining the pieces with *replace*.
    An empty *find* inserts *replace* before every character and at the end,
    which is what ``str.replace`` does; ``split`` alone would raise
    ValueError for an empty separator.
    """
    if not find:
        pieces = [''] + list(self) + ['']
    else:
        pieces = self.split(find)
    return replace.join(pieces)
>>> Replacer('adding to dingoes gives diamonds','di','omg')
'adomgng to omgngoes gives omgamonds'
Here is a method that should be pretty efficient:
def replacer(self, old, new):
    """Return ``self.base_string`` with every *old* replaced by *new*."""
    return ''.join(self._replacer(old, new))


def _replacer(self, old, new):
    """Yield the pieces of the replaced string, in order.

    Unchanged stretches of ``self.base_string`` alternate with *new*.
    """
    if not old:
        # find('') keeps returning the same index, so the loop below would
        # never end. Mirror str.replace instead: *new* goes before every
        # character and once more at the end.
        yield new
        for char in self.base_string:
            yield char
            yield new
        return

    oldlen = len(old)
    i = 0
    idx = self.base_string.find(old)
    while idx != -1:
        yield self.base_string[i:idx]
        yield new
        # Resume searching just past the occurrence that was replaced.
        i = idx + oldlen
        idx = self.base_string.find(old, i)
    yield self.base_string[i:]
Let's try with some slices (but you really should consider using the builtin method of python) :
class ReplacableString:
    """String wrapper with a hand-written, in-place substring replacement."""

    def __init__(self, base_string):
        self.base_string = base_string

    def replacer(self, to_replace, replacer):
        """Replace every *to_replace* in ``base_string`` with *replacer*.

        The result is stored back on ``self.base_string``; nothing is
        returned. Text that has just been inserted is never scanned again,
        so a replacement that contains the search string cannot be replaced
        a second time. An empty *to_replace* behaves like ``str.replace``:
        *replacer* is inserted before every character and at the end.
        """
        source = self.base_string
        if not to_replace:
            self.base_string = replacer.join([''] + list(source) + [''])
            return

        width = len(to_replace)
        pieces = []
        start = 0
        hit = source.find(to_replace)
        while hit != -1:
            pieces.append(source[start:hit])
            pieces.append(replacer)
            # Continue after the matched text, in the untouched source.
            start = hit + width
            hit = source.find(to_replace, start)
        pieces.append(source[start:])
        self.base_string = ''.join(pieces)

    def __str__(self):
        return str(self.base_string)


test_str = ReplacableString("This is eth string")
test_str.replacer("eth", "the")
print(test_str)
>>> This is the string