Using AST to change function names from CamelCase to snake_case - python

In this question I asked for a way to convert function names from CamelCase to snake_case; one of the comments suggested using the ast module.
I found a code snippet that finds all function calls in a script:
import ast
from collections import deque

class FuncCallVisitor(ast.NodeVisitor):
    def __init__(self):
        self._name = deque()

    @property
    def name(self):
        return '.'.join(self._name)

    @name.deleter
    def name(self):
        self._name.clear()

    def visit_Name(self, node):
        self._name.appendleft(node.id)

    def visit_Attribute(self, node):
        try:
            self._name.appendleft(node.attr)
            self._name.appendleft(node.value.id)
        except AttributeError:
            self.generic_visit(node)

def get_func_calls(tree):
    func_calls = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Call):
            callvisitor = FuncCallVisitor()
            callvisitor.visit(node.func)
            func_calls.append(callvisitor.name)
    return func_calls

if __name__ == '__main__':
    tree = ast.parse(open("some_dir").read())
    print(get_func_calls(tree))
Using this code I get all function calls in my script; now I want to write code that converts these names to snake_case.
I found this code snippet that modifies a node in an AST tree:
class RewriteName(ast.NodeTransformer):
    def visit_Name(self, node):
        return ast.copy_location(ast.Subscript(
            value=ast.Name(id='data', ctx=ast.Load()),
            slice=ast.Index(value=ast.Str(s=node.id)),
            ctx=node.ctx
        ), node)

tree = RewriteName().visit(tree)
I didn't understand how to use it to serve my purpose. Any explanation or other pieces of advice?

I am kind of late to this, but maybe it will be found in the future.
Anyway, here is a quick hack at it. You were actually almost there with your solution: the name property returns the dotted name, which you can then change arbitrarily. So inside get_func_calls(tree) you can manipulate the string and append the converted name instead:
ccName = callvisitor.name             # work with a local var
new_name = ''                         # the new func name
for char_i in range(len(ccName)):     # go over the name
    if ccName[char_i].isupper():      # check if the current char is uppercase
        if char_i == 0 or ccName[char_i - 1] == '.':  # don't prefix '_' at the start or right after a dot
            new_name += ccName[char_i].lower()        # just lowercase the char
        else:
            new_name += '_' + ccName[char_i].lower()  # otherwise add the snake '_'
    else:
        new_name += ccName[char_i]    # keep the lowercase chars as they are
callvisitor._name = new_name          # re-assign the new name (note: this replaces the deque with a str)
func_calls.append(callvisitor._name)
This is definitely not a pretty solution, and it also depends on whether you want to change only function definitions or every single function call in a file, but it should give you an idea of how to change the ast.
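To tie this back to the RewriteName snippet from the question: an ast.NodeTransformer can do the renaming in place, and the modified tree can then be turned back into source. Here is a minimal sketch, assuming Python 3.9+ for ast.unparse; the camel_to_snake helper and the file name are my own:

import ast, re

def camel_to_snake(name):
    # insert '_' before each inner uppercase letter, then lowercase everything
    return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()

class SnakeCaseNames(ast.NodeTransformer):
    def visit_FunctionDef(self, node):
        node.name = camel_to_snake(node.name)
        self.generic_visit(node)
        return node

    def visit_Name(self, node):
        # note: attribute calls like mod.SomeFunc() would need visit_Attribute too
        node.id = camel_to_snake(node.id)
        return node

tree = ast.parse(open("some_script.py").read())
new_tree = SnakeCaseNames().visit(tree)
print(ast.unparse(new_tree))  # Python 3.9+; older versions need e.g. the astor package

Note this renames every bare name it sees, so in practice you would filter, for example renaming only names that appear in the func_calls list collected above.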

Related

ast python how to find connections between methods at different files

I want to index all methods and the connections between them in an entire application (a directory with subdirectories and files, eventually). I'm using ast, looping over directories down to individual files, then loading each one into an ast object like so: ast.parse(self.file_content)
The index that I'm trying to create is this:
[figure: the desired "connection" index structure]
Here is my code, if it's relevant.
def scan(self):
    '''
    scans a file line by line while keeping context of location and classes
    indexes a file buffer content into an ast (Abstract Syntax Tree, https://en.wikipedia.org/wiki/AST).
    Then, iterate over the elements, find relevant ones to index, and index them into the db.
    '''
    parsed_result = ast.parse(self.file_content)
    for element in parsed_result.body:
        results = self.index_element(element)

def index_element(self, element, class_name=None):
    '''
    if element is relevant, meaning method -> index
    if element is Class -> recursively call itself
    :param element:
    :param class_name:
    :return: [{insert_result: <db_insert_result>, 'structured_data': <method> object}, ...]
    '''
    # find classes
    # find methods inside classes
    # find hanging functions
    # validation on element type
    if self.should_index_element(element):
        if self.is_class_definition(element):
            class_element = element
            indexed_items = []
            for inner_element in class_element.body:
                # recursive call
                results = self.index_element(inner_element, class_name=class_element.name)
                indexed_items += results
            return indexed_items
        else:
            structured_data = self.structure_method_into_an_object(element, class_name=class_name)
            result_graph = self.dal_client.methods_graph.insert_or_update(structured_data)
            return "WhatEver"
    return "WhatEver"
My question is: is it possible to create this graph using ast? If yes, how?
From my understanding, I currently can't, since I'm loading one file at a time into the ast object and it is not aware of outside methods.
Here is an example of two files that I want to link:
sample_a.py

from sample_class_b import SampleClassB

sample_b = SampleClassB()

class SampleClassA(object):
    def __init__(self):
        self.a = 1

    def test_call_to_another_function(self):
        return sample_b.test()

sample_b.py

class SampleClassB(object):
    def __init__(self):
        self.b = 1

    def test(self):
        return True
You can traverse the ast tree and at each recursive call do one of four things:
1. If the tree is a class definition, store the class name with its associated methods, then apply Connections.walk to each of the methods, storing the class and method name in the scope.
2. If the tree is an import statement, load the module and recursively run Connections.walk on it.
3. If an attribute lookup is being made and Connections.walk is within a method, check whether the attribute name is a method of any class currently loaded. If so, add an edge to edges linking the current scope with the newly discovered method.
4. If none of the above applies, continue to traverse the tree.
import ast, itertools
import re, importlib

class Connections:
    def __init__(self):
        self._classes, self.edges = {}, []

    def walk(self, tree, scope=None):
        t_obj = None
        if isinstance(tree, ast.ClassDef):
            self._classes[tree.name] = [i for i in tree.body if isinstance(i, ast.FunctionDef) and not re.findall('__[a-z]+__', i.name)]
            _ = [self.walk(i, [tree.name, i.name]) for i in self._classes[tree.name]]
            t_obj = [i for i in tree.body if i not in self._classes[tree.name]]
        elif isinstance(tree, (ast.Import, ast.ImportFrom)):
            for p in [tree.module] if hasattr(tree, 'module') else [i.name for i in tree.names]:
                with open(importlib.import_module(p).__file__) as f:
                    t_obj = ast.parse(f.read())
        elif isinstance(tree, ast.Attribute) and scope is not None:
            if (c := [a for a, b in self._classes.items() if any(i.name == tree.attr for i in b)]):
                self.edges.append((scope, [c[0], tree.attr]))
            t_obj = tree.value
        if isinstance(t_obj := (tree if t_obj is None else t_obj), list):
            for i in t_obj:
                self.walk(i, scope=scope)
        else:
            for i in getattr(t_obj, '_fields', []):
                self.walk(getattr(t_obj, i), scope=scope)

with open('sample_a.py') as f:
    c = Connections()
    c.walk(ast.parse(f.read()))

print(c.edges)
Output:
[(['SampleClassA', 'test_call_to_another_function'], ['SampleClassB', 'test'])]
Important note: depending on the complexity of the files you are running Connections.walk on, a RecursionError might occur. To circumvent this, here is a Gist that contains an iterative version of Connections.walk.
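If that link is unavailable, here is a rough sketch of what such an iterative version can look like, using an explicit (node, scope) stack in place of recursion. Children are pushed in reverse so they are still visited in source order, which matters because imports must be processed before the classes that call into them:

import ast, importlib, re

def walk_iterative(conn, tree, scope=None):
    """Iterative rendition of Connections.walk, pushing (node, scope) pairs
    onto an explicit stack instead of recursing."""
    stack = [(tree, scope)]
    while stack:
        node, node_scope = stack.pop()
        t_obj = None
        if isinstance(node, ast.ClassDef):
            conn._classes[node.name] = [i for i in node.body
                                        if isinstance(i, ast.FunctionDef)
                                        and not re.findall('__[a-z]+__', i.name)]
            for i in reversed(conn._classes[node.name]):
                stack.append((i, [node.name, i.name]))
            t_obj = [i for i in node.body if i not in conn._classes[node.name]]
        elif isinstance(node, (ast.Import, ast.ImportFrom)):
            for p in [node.module] if hasattr(node, 'module') else [i.name for i in node.names]:
                with open(importlib.import_module(p).__file__) as f:
                    t_obj = ast.parse(f.read())
        elif isinstance(node, ast.Attribute) and node_scope is not None:
            c = [a for a, b in conn._classes.items() if any(i.name == node.attr for i in b)]
            if c:
                conn.edges.append((node_scope, [c[0], node.attr]))
            t_obj = node.value
        t_obj = node if t_obj is None else t_obj
        # push children in reverse so they pop in source order
        if isinstance(t_obj, list):
            for i in reversed(t_obj):
                stack.append((i, node_scope))
        else:
            for name in reversed(getattr(t_obj, '_fields', ())):
                stack.append((getattr(t_obj, name), node_scope))
    return conn.edges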
Creating a graph from edges:
import networkx as nx
import matplotlib.pyplot as plt

g, labels, c1 = nx.DiGraph(), {}, itertools.count(1)
for m1, m2 in c.edges:
    if (p1 := '.'.join(m1)) not in labels:
        labels[p1] = next(c1)
    if (p2 := '.'.join(m2)) not in labels:
        labels[p2] = next(c1)
    g.add_node(labels[p1])
    g.add_node(labels[p2])
    g.add_edge(labels[p1], labels[p2])

pos = nx.spring_layout(g)  # compute node positions; 'pos' was undefined in the original snippet
nx.draw(g, pos, labels={b: a for a, b in labels.items()}, with_labels=True)
plt.show()
Output: a directed graph with one edge, from SampleClassA.test_call_to_another_function to SampleClassB.test.

Identifying pure functions in python

I have a decorator @pure that registers a function as pure, for example:
@pure
def rectangle_area(a, b):
    return a * b

@pure
def triangle_area(a, b, c):
    return ((a+(b+c)) * (c-(a-b)) * (c+(a-b)) * (a+(b-c))) ** 0.5 / 4
Next, I want to identify a newly defined pure function:

def house_area(a, b, c):
    return rectangle_area(a, b) + triangle_area(a, b, c)

Obviously house_area is pure, since it only calls pure functions.
How can I discover all pure functions automatically (perhaps by using ast)?
Assuming operators are all pure, then essentially you only need to check all the function calls. This can indeed be done with the ast module.
First I defined the pure decorator as:
def pure(f):
    f.pure = True
    return f
Adding an attribute marking it as pure allows skipping early, or "forcing" a function to identify as pure. This is useful if you need a function like math.sin to identify as pure, and is also necessary since you can't add attributes to built-in functions:
@pure
def sin(x):
    return math.sin(x)
All in all: use the ast module to visit all the nodes, and for each Call node check whether the function being called is pure.
import ast

class PureVisitor(ast.NodeVisitor):
    def __init__(self, visited):
        super().__init__()
        self.pure = True
        self.visited = visited

    def visit_Name(self, node):
        return node.id

    def visit_Attribute(self, node):
        name = [node.attr]
        child = node.value
        while child is not None:
            if isinstance(child, ast.Attribute):
                name.append(child.attr)
                child = child.value
            else:
                name.append(child.id)
                break
        name = ".".join(reversed(name))
        return name

    def visit_Call(self, node):
        if not self.pure:
            return
        name = self.visit(node.func)
        if name not in self.visited:
            self.visited.append(name)
            try:
                callee = eval(name)
                if not is_pure(callee, self.visited):
                    self.pure = False
            except NameError:
                self.pure = False
Then check whether the function has the pure attribute. If not, get its source code and check whether all the function calls in it can be classified as pure.
import inspect, textwrap

def is_pure(f, _visited=None):
    try:
        return f.pure
    except AttributeError:
        pass
    try:
        code = inspect.getsource(f.__code__)
    except AttributeError:
        return False
    code = textwrap.dedent(code)
    node = compile(code, "<unknown>", "exec", ast.PyCF_ONLY_AST)
    if _visited is None:
        _visited = []
    visitor = PureVisitor(_visited)
    visitor.visit(node)
    return visitor.pure
Note that print(is_pure(lambda x: math.sin(x))) doesn't work, since inspect.getsource(f.__code__) works line by line; the source returned for the lambda would include the surrounding print and is_pure calls, thus yielding False (unless those functions are overridden).
To verify that it works, test it by doing:
print(is_pure(house_area))  # Prints: True
To list through all the functions in the current module:
import sys, types

for k in dir(sys.modules[__name__]):
    v = globals()[k]
    if isinstance(v, types.FunctionType):
        print(k, is_pure(v))
The visited list keeps track of which functions have already been checked. This helps circumvent problems related to recursion: since the code isn't executed, the analysis would otherwise keep visiting factorial forever.
@pure
def factorial(n):
    return 1 if n == 1 else n * factorial(n - 1)
Note that you might need to revise the following code, choosing another way to obtain a function object from its name:
try:
    callee = eval(name)
    if not is_pure(callee, self.visited):
        self.pure = False
except NameError:
    self.pure = False
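One such alternative (a sketch of my own, not from the answer above) is to look the name up in the inspected function's global namespace instead of using eval:

def resolve(name, namespace):
    """Resolve a possibly dotted name like 'math.sin' without eval()."""
    parts = name.split('.')
    obj = namespace[parts[0]]  # raises KeyError for unknown names, analogous to NameError above
    for part in parts[1:]:
        obj = getattr(obj, part)
    return obj

# is_pure would then thread f.__globals__ through to PureVisitor, and visit_Call
# would use: callee = resolve(name, self.namespace)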

Static classes being initialised on import: how does Python 2 initialise static classes on import?

I am trying to introduce Python 3 support for the package mime, and the code is doing something I have never seen before.
There is a class Types() that is used in the package as a static class.
class Types(with_metaclass(ItemMeta, object)):  # I changed this for 2-3 compatibility
    type_variants = defaultdict(list)
    extension_index = defaultdict(list)
    # __metaclass__ = ItemMeta  # unnecessary now

    def __init__(self, data_version=None):
        self.data_version = data_version
The type_variants defaultdict is what is getting filled in Python 2 but not in Python 3.
It very much seems to be getting filled by this class, which is in a different file called mime_types.py:
class MIMETypes(object):
    _types = Types(VERSION)

    def __repr__(self):
        return '<MIMETypes version:%s>' % VERSION

    @classmethod
    def load_from_file(cls, type_file):
        data = open(type_file).read()
        data = data.split('\n')
        mime_types = Types()
        for index, line in enumerate(data):
            item = line.strip()
            if not item:
                continue
            try:
                ret = TEXT_FORMAT_RE.match(item).groups()
            except Exception as e:
                __parsing_error(type_file, index, line, e)
            (unregistered, obsolete, platform, mediatype, subtype, extensions,
             encoding, urls, docs, comment) = ret
            if mediatype is None:
                if comment is None:
                    __parsing_error(type_file, index, line, RuntimeError)
                continue
            extensions = extensions and extensions.split(',') or []
            urls = urls and urls.split(',') or []
            mime_type = Type('%s/%s' % (mediatype, subtype))
            mime_type.extensions = extensions
            ...
            mime_type.url = urls
            mime_types.add(mime_type)  # instance of Type() is being filled?
        return mime_types
The function startup() is run whenever mime_types.py is imported, and it does this:
def startup():
    global STARTUP
    if STARTUP:
        type_files = glob(join(DIR, 'types', '*'))
        type_files.sort()
        for type_file in type_files:
            MIMETypes.load_from_file(type_file)  # class method is filling Types?
        STARTUP = False
This all seems pretty weird to me. The MIMETypes class first creates an instance of Types() on the first line, _types = Types(VERSION), then seems to do nothing with that instance and only uses the mime_types instance created in the load_from_file() class method, mime_types = Types().
This sort of thing vaguely reminds me of JavaScript class construction. How is the instance mime_types filling Types.type_variants, so that when it is imported like this:
from mime import Type, Types
the class's type_variants defaultdict can be used? And why isn't this working in Python 3?
EDIT:
Adding extra code to show how type_variants is filled
(In the Types class)

@classmethod
def add_type_variant(cls, mime_type):
    cls.type_veriants[mime_type.simplified].append(mime_type)

@classmethod
def add(cls, *types):
    for mime_type in types:
        if isinstance(mime_type, Types):
            cls.add(*mime_type.defined_types())
        else:
            mts = cls.type_veriants.get(mime_type.simplified)
            if mts and mime_type in mts:
                Warning('Type %s already registered as a variant of %s.',
                        mime_type, mime_type.simplified)
            cls.add_type_variant(mime_type)
            cls.index_extensions(mime_type)
You can see that MIMETypes uses the add() classmethod.
Without posting more of your code, it's hard to say. I will say that I was able to get that package ported to Python 3 with only a few changes (print statement -> function, basestring -> str, adding a dot before same-package imports, and a really ugly hack to compensate for their love of cmp):
def cmp(x, y):
    if isinstance(x, Type): return x.__cmp__(y)
    if isinstance(y, Type): return y.__cmp__(x) * -1
    return 0 if x == y else (1 if x > y else -1)
Note, I'm not even sure this is correct.
Then
import mime
print(mime.Types.type_veriants) # sic
printed out a 1590 entry defaultdict.
Regarding your question about MIMETypes._types not being used, I agree, it's not.
Regarding your question about how the dictionary is being populated, it's quite simple, and you've identified most of it.
import mime
imports the package's __init__.py, which contains the line:
from .mime_types import MIMETypes, VERSION
And mime_types.py includes the lines:
def startup():
    global STARTUP
    if STARTUP:
        type_files = glob(join(DIR, 'types', '*'))
        type_files.sort()
        for type_file in type_files:
            MIMETypes.load_from_file(type_file)
        STARTUP = False

startup()
And MIMETypes.load_from_file() has the lines:
mime_types = Types()
# ...
for ... in ...:
    mime_types.add(mime_type)
And Types.add() has the line:
cls.add_type_variant(mime_type)
And that classmethod contains:
cls.type_veriants[mime_type.simplified].append(mime_type)
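If it helps, the mechanism boils down to classmethods mutating class-level mutable state: calls made through any instance fill the dict that lives on the class itself. A minimal standalone reproduction of the pattern (a plain string stands in for the real Type objects):

from collections import defaultdict

class Types(object):
    type_veriants = defaultdict(list)  # class-level, shared by all instances

    @classmethod
    def add_type_variant(cls, mime_type):
        cls.type_veriants[mime_type].append(mime_type)

Types().add_type_variant('text/plain')  # called via an instance...
print(Types.type_veriants)              # ...but it fills the class attribute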

How to extract functions used in a python code file?

I would like to create a list of all the functions used in a code file. For example, if we have the following code in a file named 'add_random.py':

import numpy as np
from numpy import linalg

def foo():
    print np.random.rand(4) + np.random.randn(4)
    print linalg.norm(np.random.rand(4))
I would like to extract the following list:
[numpy.random.rand, np.random.randn, np.linalg.norm, np.random.rand]
The list contains the functions used in the code with their actual name in the form of 'module.submodule.function'. Is there something built into the Python language that can help me do this?
You can extract all call expressions with:
import ast

class CallCollector(ast.NodeVisitor):
    def __init__(self):
        self.calls = []
        self.current = None

    def visit_Call(self, node):
        # new call, trace the function expression
        self.current = ''
        self.visit(node.func)
        self.calls.append(self.current)
        self.current = None

    def generic_visit(self, node):
        if self.current is not None:
            print "warning: {} node in function expression not supported".format(
                node.__class__.__name__)
        super(CallCollector, self).generic_visit(node)

    # record the func expression
    def visit_Name(self, node):
        if self.current is None:
            return
        self.current += node.id

    def visit_Attribute(self, node):
        if self.current is None:
            # attribute outside a call expression, keep traversing
            self.generic_visit(node)
        else:
            self.visit(node.value)
            self.current += '.' + node.attr
Use this with an ast parse tree:

tree = ast.parse(yoursource)
cc = CallCollector()
cc.visit(tree)
print cc.calls
Demo:
>>> tree = ast.parse('''\
... def foo():
...     print np.random.rand(4) + np.random.randn(4)
...     print linalg.norm(np.random.rand(4))
... ''')
>>> cc = CallCollector()
>>> cc.visit(tree)
>>> cc.calls
['np.random.rand', 'np.random.randn', 'linalg.norm']
The above walker only handles names and attributes; if you need more complex expression support, you'll have to extend this.
Note that collecting names like this is not a trivial task: any indirection will not be handled. You could build a dictionary of functions in your code and dynamically swap out function objects, and static analysis like the above won't be able to track it.
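If you also need the aliases resolved back to real module paths (np.random.rand -> numpy.random.rand, linalg.norm -> numpy.linalg.norm, as the question asks), one option is to collect the import aliases from the same tree first and rewrite the collected names. A sketch under that assumption (ImportCollector and resolve are my own names; relative imports are ignored):

import ast

class ImportCollector(ast.NodeVisitor):
    """Map local alias -> real dotted module path, e.g. 'np' -> 'numpy'."""
    def __init__(self):
        self.aliases = {}

    def visit_Import(self, node):
        for alias in node.names:
            self.aliases[alias.asname or alias.name] = alias.name

    def visit_ImportFrom(self, node):
        for alias in node.names:
            if node.module:  # skip relative imports in this sketch
                self.aliases[alias.asname or alias.name] = '{}.{}'.format(node.module, alias.name)

def resolve(call_name, aliases):
    head, _, rest = call_name.partition('.')
    real = aliases.get(head, head)
    return (real + '.' + rest) if rest else real

# usage, continuing from the CallCollector demo above:
# ic = ImportCollector()
# ic.visit(tree)
# [resolve(name, ic.aliases) for name in cc.calls]
# -> ['numpy.random.rand', 'numpy.random.randn', 'numpy.linalg.norm']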
In general, this problem is undecidable; consider for example getattr(random, "random")().
If you want static analysis, the best there is now is jedi
If you accept dynamic solutions, then coverage is your best friend. It will show all used functions, though, rather than only directly referenced ones.
Finally, you can always roll your own dynamic instrumentation along the lines of:

import random
import logging

class Proxy(object):
    def __getattr__(self, name):
        logging.debug("tried to use random.%s", name)
        return getattr(_random, name)

_random = random
random = Proxy()

Python / YAML: How to initialize additional objects not just from the YAML file, within loadConfig?

I have what I think is a small misconception with loading some YAML objects. I defined the class below.
What I want to do is load some objects with the overridden loadConfig function for YAMLObjects. Some of these come from my .yaml file, but others should be built out of objects loaded from the YAML file.
For instance, in the class below, I load a member object named "keep" which is a string naming some items to keep in the region. But I want to also parse this into a list and have the list stored as a member object too. And I don't want the user to have to give both the string and list version of this parameter in the YAML.
My current workaround has been to override the __getattr__ function inside Region and make it create the defaults when a lookup doesn't find them. But this is clunky and more complicated than needed for just initializing objects.
What convention am I misunderstanding here? Why doesn't the loadConfig method create additional things not found in the YAML?
import yaml, pdb

class Region(yaml.YAMLObject):
    yaml_tag = u'!Region'

    def __init__(self, name, keep, drop):
        self.name = name
        self.keep = keep
        self.drop = drop
        self.keep_list = self.keep.split("+")
        self.drop_list = self.drop.split("+")
        self.pattern = "+".join(self.keep_list) + "-" + "-".join(self.drop_list)
    ###

    def loadConfig(self, yamlConfig):
        yml = yaml.load_all(file(yamlConfig))
        for data in yml:
            # These get created fine
            self.name = data["name"]
            self.keep = data["keep"]
            self.drop = data["drop"]
            # These do not get created.
            self.keep_list = self.keep.split("+")
            self.drop_list = self.drop.split("+")
            self.pattern = "+".join(self.keep_list) + "-" + "-".join(self.drop_list)
    ###
### End Region

if __name__ == "__main__":
    my_yaml = "/home/path/to/test.yaml"
    region_iterator = yaml.load_all(file(my_yaml))
    # Set a debug breakpoint to play with region_iterator and
    # confirm the extra stuff isn't created.
    pdb.set_trace()
And here is test.yaml so you can run all of this and see what I mean:
Regions:
  # Note: the string conventions below are for an
  # existing system. This is a shortened, representative
  # example.
  Market1:
    !Region
    name: USAndGB
    keep: US+GB
    drop: !!null
  Market2:
    !Region
    name: CanadaAndAustralia
    keep: CA+AU
    drop: !!null
And here, for example, is what it looks like for me when I run this in an IPython shell and explore the loaded object:
In [57]: %run "/home/espears/testWorkspace/testRegions.py"
--Return--
> /home/espears/testWorkspace/testRegions.py(38)<module>()->None
-> pdb.set_trace()
(Pdb) region_iterator
<generator object load_all at 0x1139d820>
(Pdb) tmp = region_iterator.next()
(Pdb) tmp
{'Regions': {'Market2': <__main__.Region object at 0x1f858550>, 'Market1': <__main__.Region object at 0x11a91e50>}}
(Pdb) us = tmp['Regions']['Market1']
(Pdb) us
<__main__.Region object at 0x11a91e50>
(Pdb) us.name
'USAndGB'
(Pdb) us.keep
'US+GB'
(Pdb) us.keep_list
*** AttributeError: 'Region' object has no attribute 'keep_list'
A pattern I have found useful when working with YAML for classes that are basically storage is to have the loader use the constructor, so that objects are created the same way as when you make them normally. If I understand what you are attempting to do correctly, this kind of structure might be useful:
import inspect
import yaml
import numpy as np  # needed: _dict below special-cases 1-D numpy arrays
from collections import OrderedDict

class Serializable(yaml.YAMLObject):
    __metaclass__ = yaml.YAMLObjectMetaclass

    @property
    def _dict(self):
        dump_dict = OrderedDict()
        for var in inspect.getargspec(self.__init__).args[1:]:
            if getattr(self, var, None) is not None:
                item = getattr(self, var)
                if isinstance(item, np.ndarray) and item.ndim == 1:
                    item = list(item)
                dump_dict[var] = item
        return dump_dict

    @classmethod
    def to_yaml(cls, dumper, data):
        return ordered_dump(dumper, '!{0}'.format(data.__class__.__name__),
                            data._dict)

    @classmethod
    def from_yaml(cls, loader, node):
        fields = loader.construct_mapping(node, deep=True)
        return cls(**fields)

def ordered_dump(dumper, tag, data):
    value = []
    node = yaml.nodes.MappingNode(tag, value)
    for key, item in data.iteritems():
        node_key = dumper.represent_data(key)
        node_value = dumper.represent_data(item)
        value.append((node_key, node_value))
    return node
You would then want to have your Region class inherit from Serializable, and remove the loadConfig stuff. The code I posted inspects the constructor to see what data to save to the yaml file, and then when loading a yaml file calls the constructor with that same set of data. That way you just have to get the logic right in your constructor and the yaml loading should get it for free.
That code was ripped from one of my projects, apologies in advance if it doesn't quite work. It is also slightly more complicated than it needs to be because I wanted to control the order of output by using OrderedDict. You could replace my ordered_dump function with a call to dumper.represent_dict.
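For instance, the Region class from the question would then shrink to a constructor that builds the derived members, and from_yaml would call it with the YAML fields. A sketch under the same assumptions as above; the None guards are my addition, since drop is !!null in the sample YAML:

class Region(Serializable):
    yaml_tag = u'!Region'

    def __init__(self, name, keep, drop):
        self.name = name
        self.keep = keep
        self.drop = drop
        # derived members are built here, so loading from YAML creates them too
        self.keep_list = keep.split("+") if keep else []
        self.drop_list = drop.split("+") if drop else []
        self.pattern = "+".join(self.keep_list) + "-" + "-".join(self.drop_list)

Loading the sample YAML would then give Region objects that already have keep_list, drop_list and pattern, because from_yaml calls the constructor with the mapping's fields.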
