Pass in any variable while guaranteeing predefined relationship - python

I have a simple formula taking basic arithmetic calculations given several inputs.
a = 1
b = 2
c = a + b #3=1+2
d = 4
e = c + d #7=3+4
In theory, the relationships should always hold true. And I want to write a function that user can modify any variable and the rest will be auto updated (update priority has been predefined if there are more than one alternative eg. update the most right node first).
def f():
#default state
state = {'a':1, 'b':2, 'c':3, 'd':4, 'e':7}
...
return state
f(a=0) == {'a':0, 'b':2, 'c':2, 'd':4, 'e':6}
f(c=4) == {'a':1, 'b':3, 'c':4, 'd':4, 'e':8}
f(b=2, c=4) == {'a':2, 'b':2, 'c':4, 'd':4, 'e':8}
I tried to use **kwargs and *args to allow the user to pass in any variable but have to hard code the update logic based on which variable got modified. Any better ideas?
P.S.: this example is for demonstration purpose; the real problem involves much more variables and the mathematical relationship is also more difficult (logarithm, exponential, ..)

You indeed can represent the state of the problem as a directional graph, where each edge between the variables contains an arithematic expression to use on the other node to update its state.
Than, use the BFS algorithem starting at the variable that you try to change and it will modify all of the varialbes that are linked to it and so forward.
If you have to change multiple varialbes, run the bfs multiple times, starting at each variable.
Than, if the one of the variable you ran the BFS on changes (and does not contain the wanted value), than you know the wanted state is not possible (because the variables are co-dependent).
So each time you add a state to the problem, you have to modify only the edges connected to it (and from it).

Problem Analysis
Two numbers get combined with a mathematical operators, resulting in another number
Resultant numbers depend on a relationship,
These relationships and numbers are illustrated as a tree structure.
Hence, there are two types of cells (#Copperfield):
Free cells, not depending on a relationship, dangling as leaves in the tree.
Inner cells, depending on a relationship between two cells, we will call them nodes.
Resulting tree never makes cycles.
Assumptions and Rationale
In his comments, #B.Mr.W. says, each relationship is formed by mathematical operators and there can be more than one nodes pointing to another node.
I assume he has relations like d = a - b * c. Evaluation of such expressions / relations has to follow operator precedence.
And, such expressions will be anyhow resolved to d = a - (b*c).
Evaluation of such expressions would result in sub-relationships which are again binary.
Note: These binary sub-relationships remain anonymous.
Requirements
Create new Leaf cells storing a number.
Create new Node cells by define relationships between other cells.
A Change to any cell should update all related cells, without affecting the relationship. That is, a cascading effect along the related branches.
Requirement 3. has to follow an ordering preference. The right-side node is preferred by default.
Solution Analysis
Both types of cells can be combined within or with other-type using a mathematical operator. (Implemented in class Cell)
I follow Python Data Model to solve this problem and interface style becomes:
# create new Leaf nodes with values:
a = Leaf(2)
b = Leaf(3)
c = Leaf(4)
# define relationships
d = a + b*c # (b*c) becomes anonymous
e = d - Leaf(3) # Leaf(3) is also anonymous
( ( 2 + ( 3 * 4 ) ) - 3 ) => { 'a':2, 'b':3, 'c':4, 'd':14, 'e':11 }
# get values of known nodes
a(), b(), c(), d(), e()
# update values
a(1) => { 'a':1, 'b':3, 'c':4, 'd':13, 'e':10 }
b(2) => { 'a':1, 'b':2, 'c':4, 'd':9, 'e':6 }
c(3) => { 'a':1, 'b':2, 'c':3, 'd':7, 'e':4 }
Code:
"""Super-type of Leaves and Nodes."""
class Cell:
"""Arithematic operation handlers between Leaves/Nodes"""
def __add__(self, other):
return Node(leftCell=self, rightCell=other, op='+')
def __sub__(self, other):
return Node(leftCell=self, rightCell=other, op='-')
def __mul__(self, other):
return Node(leftCell=self, rightCell=other, op='*')
def __truediv__(self, other):
return Node(leftCell=self, rightCell=other, op='/')
"""for clean presentation of float values"""
#staticmethod
def cleanFloat(val:int|float):
if isinstance(val, float):
rounded = round(val)
if abs(rounded-val)<0.011:
return rounded
else:
return round(val, 2)
return val
"""Leaves will contain values only"""
class Leaf(Cell):
def __init__(self, val:int|float):
self.val = val
"""Getter/Setter for Leaf val"""
#property
def val(self):
return self.cleanFloat(self.__val)
#val.setter
def val(self, val:int|float):
self.__val = self.cleanFloat(val)
"""Getter/Setter of Leaf object."""
def __call__(self, val:int|float=None):
if val == None:
return self.val
else:
self.val = val
def __str__(self):
return f"{self.val}"
"""Nodes contain left/right child, an arithematic operation and preferred side for update"""
class Node(Cell):
def __init__(self, leftCell:Cell, rightCell:Cell,
op:str, prefSide:str='right'):
self.leftCell = leftCell
self.rightCell = rightCell
self.op = op
self.prefSide = prefSide
"""
Preferred and the other cells for reverse path operation required during update.
These properties will help clean retrieval.
"""
#property
def preferredCell(self):
match self.prefSide:
case 'left' : return self.leftCell
case 'right': return self.rightCell
#property
def unPreferredCell(self):
match self.prefSide:
case 'left' : return self.rightCell
case 'right': return self.leftCell
"""Getter/Setter for Nodes"""
def __call__(self, val :int|float = None):
if val == None:
match self.op:
case '+':
nodeVal = self.leftCell() + self.rightCell()
case '-':
nodeVal = self.leftCell() - self.rightCell()
case '*':
nodeVal = self.leftCell() * self.rightCell()
case '/':
nodeVal = self.leftCell() / self.rightCell()
case _:
raise
return self.cleanFloat(nodeVal)
else:
match self.op:
case '+':
self.preferredCell( val - self.unPreferredCell() )
case '*':
self.preferredCell( val / self.unPreferredCell() )
case '-':
match self.prefSide:
case 'left' :
self.preferredCell( val + self.unPreferredCell() )
case 'right' :
self.preferredCell( self.unPreferredCell() - val )
case '/':
match self.prefSide:
case 'left ' :
self.preferredCell( val * self.unPreferredCell() )
case 'right' :
self.preferredCell( self.unPreferredCell() / val )
case _:
raise
def __str__(self):
return f"( {str(self.leftCell)} {self.op} {str(self.rightCell)} )"
if __name__ == '__main__':
def createTree():
# create new Leaf nodes having values
a = Leaf(2)
b = Leaf(3)
c = Leaf(4)
d = Leaf(5)
e = Leaf(6)
# define relationships
f = a + b
g = d / f - c # (d / f) becomes anonymous, higher precedence
h = Leaf(9) / e * g
# here, (Leaf(9)/e) creates the anonymous node, left to right associativity
return (a,b,c,d,e,f,g,h)
(a,b,c,d,e,f,g,h) = createTree()
def treeDict():
return f"{{ 'a':{a()}, 'b':{b()}, 'c':{c()}, 'd':{d()}, 'e':{e()}, 'f':{f()}, 'g':{g()}, 'h':{h()} }}"
print('\nget values of known cells:')
print(f"{h} => {treeDict()}\n")
print('each cell expanded (take care of anonymous cells):')
print(f"'a':{a}\n'b':{b}\n'c':{c}\n'd':{d}\n'e':{e}\n'f':{f}\n'g':{g}\n'h':{h}\n")
print('update values:')
a(1)
print( f"a(1) => {treeDict()}")
b(2)
print( f"b(2) => {treeDict()}")
c(3)
print( f"c(3) => {treeDict()}")
f(10)
print(f"f(10) => {treeDict()}")
g(10)
print(f"g(10) => {treeDict()}")
h(100)
print(f"h(100) => {treeDict()}")
print('\nchange ordering preference: g.prefSide = "left"')
(a,b,c,d,e,f,g,h) = createTree()
g.prefSide = 'left'
g(1)
print(f"g(1) => {treeDict()}")
print('\nchange ordering preference: g.prefSide = "left"')
(a,b,c,d,e,f,g,h) = createTree()
g.prefSide = 'left'
h(0)
print(f"h(0) => {treeDict()}")
print("\nAccessing Anonymous Cells:")
print(f"h.leftCell() : {h.leftCell()}")
print(f"h.leftCell.leftCell() : {h.leftCell.leftCell()}")

I wonder if this (using SymPy) gets at the desired behavior:
from sympy.abc import *
from sympy import *
ed = Tuple(
(Eq(d, a+b+c),c),
(Eq(g, e+f),f),
(Eq(h,d+g),g))
def indep(ed):
free = set()
dep = set()
for i,_ in ed:
assert i.lhs.is_Symbol
free |=i.rhs.free_symbols
dep |= {i.lhs}
return free - dep
def F(x, v, eq):
eq = list(eq)
free = indep(eq)
if x in free:
# leaf
return [i[0] for i in eq], free
else:
for i,y in eq:
if i.lhs == x:
return [i[0] for i in eq] + [Eq(x, v)], free - {y}
assert None, 'unknown parameter'
def update(x, v, ed):
eqs, ex = F(x, v, ed)
sol = solve(eqs, exclude=ex, dict=True)
assert len(sol) == 1
return Dict(sol), list(ordered(ex))
ed represents the list of equations and the defined "dependent/rightmost" variable for each equation, however you want to define it.
So if you want to update g = 10 you would have the following values:
>>> update(g, 10, ed)
({d: a + b + c, f: 10 - e, g: 10, h: a + b + c + 10}, [a, b, c, e])
The variables on the right are the ones that you would have to specify in order to get known values for all others
>>> _[0].subs(dict(zip(_[1],(1,1,1,1))))
{d: 3, f: 9, g: 10, h: 13}

Related

Optimize sympy expression evaluation by combining as many free symbols as possible

Imagine I have an unknown, quite complex expression that I need to repeatedly evaluate numerically, e.g.:
my_expr = (a*b*c**2 - 2*sqrt(d*(a*b-c-e+x)))/(b - 1)
Each time I reevaluate the expression, the only symbol that changes is 'x', so it makes sense for me to precompute all the others (I will be using c code generation eventually).
So what I want is to automatically pull out and combine as many free symbols as possible in advance, except for x. This would work a bit like cse, but making the final expression contain as few calculations as possible.
e.g. for the above I might end up with a system equivalent to this:
var1 = a*b*c**2
var2 = a*b-c-e
var3 = b - 1
my_new_expr = (var1-2*sqrt(d*(var2+x)))/var3
This means I can precalculate var1,var2 & var3, and the repeated calculation (my_new_expr) is as simple as possible computationally.
Is there anyway I can do this in sympy? I've looked through all the simplification methods etc, including collect etc, and none quite do what I need. Failing that, is there any traversal of the expression I could do to achieve this?
Although my model branch at sympy/smichr has a more comprehensive solution, the following will do pretty well at condensing those sub-expressions that are constant:
def condense(eq, *x):
"""collapse additive/multiplicative constants into single
variables, returning condensed expression and replacement
values.
Examples
========
Simple constants are left unchanged
>>> condense(2*x + 2, x)
(2*x + 2, {})
More complex constants are replaced by a single variable
>>> first = condense(eq, x); first
(c6*(c5 - 2*sqrt(d*(c4 + x))), {c4: a*b - c - e, c6: 1/(b - 1), c5: a*b*c**2})
If a condensed expression is expanded, there may be more simplification possible:
>>> second = condense(first[0].expand(), x); second
(c0 + c2*sqrt(c1 + d*x), {c1: c4*d, c2: -2*c6, c0: c5*c6})
>>> full_reps = {k: v.xreplace(first[1]) for k, v in second[1].items()}; full_reps
{c1: d*(a*b - c - e), c2: -2/(b - 1), c0: a*b*c**2/(b - 1)}
More than 1 variable can be designated:
>>> condense(eq, c, e)
(c4*(c**2*c1 - 2*sqrt(d*(-c + c2 - e))), {c4: 1/(b - 1), c1: a*b, c2: a*b + x})
"""
reps = {}
con = numbered_symbols('c')
free = eq.free_symbols
def c():
while True:
rv = next(con)
if rv not in free:
return rv
def do(e):
i, d = e.as_independent(*x)
if not i.args: return e
return e.func(reps.get(i, reps.setdefault(i, c())), d)
rv = eq.replace(lambda x: x.is_Add or x.is_Mul, lambda x: do(x))
reps = {v: k for k, v in reps.items()}
keep = rv.free_symbols & set(reps)
reps = {k: reps[k].xreplace(reps) for k in keep}
return rv, reps

Create a list of unique numbers by applying transitive closure

I have a list of tuples (each tuple consists of 2 numbers) like:
array = [(1, 2), (1, 3), (2, 4), (5, 8), (8, 10)]
Lets say, these numbers are ids of some db objects (records) and inside a tuple, there are ids of duplicate objects. Which means 1 and 2 are duplicate. 1 and 3 are duplicate which means 2 and 3 are also duplicate.
if a == b and b == c then a == c
Now I want to merge all these duplicate objects ids into a single tuple like this:
output = [(1, 2, 3, 4), (5, 8, 10)]
I know I can do this using loops and redundant matches. I just want some better solution with low processing / calculations (if there is any).
You can use a data structure making it more efficient to perform a merge. Here you create some sort of opposite tree. So in your example you first would create the numbers listed:
1 2 3 4 5 8 10
Now if you iterate over the (1,2) tuple, you look up 1 and 2 in some sort of dictionary. You search their ancestors (there are none here) and then you create some sort of merge node:
1 2 3 4 5 8 10
\/
12
Next we merge (1,3) so we look up the ancestor of 1 (12) and 3 (3) and perform another merge:
1 2 3 4 5 8 10
\/ |
12 /
\/
123
Next we merge (2,4) and (5,8) and (8,10):
1 2 3 4 5 8 10
\/ | | \/ |
12 / | 58 /
\/ / \/
123 / 5810
\/
1234
You also keep a list of the "merge-heads" so you can easily return the elements.
Time to get our hands dirty
So now that we know how to construct such a datastructure, let's implement one. First we define a node:
class Merge:
def __init__(self,value=None,parent=None,subs=()):
self.value = value
self.parent = parent
self.subs = subs
def get_ancestor(self):
cur = self
while cur.parent is not None:
cur = cur.parent
return cur
def __iter__(self):
if self.value is not None:
yield self.value
elif self.subs:
for sub in self.subs:
for val in sub:
yield val
Now we first initialize a dictionary for every element in your list:
vals = set(x for tup in array for x in tup)
and create a dictionary for every element in vals that maps to a Merge:
dic = {val:Merge(val) for val in vals}
and the merge_heads:
merge_heads = set(dic.values())
Now for each tuple in the array, we lookup the corresponding Merge object that is the ancestor, we create a new Merge on top of that, remove the two old heads from the merge_head set and add the new merge to it:
for frm,to in array:
mra = dic[frm].get_ancestor()
mrb = dic[to].get_ancestor()
mr = Merge(subs=(mra,mrb))
mra.parent = mr
mrb.parent = mr
merge_heads.remove(mra)
merge_heads.remove(mrb)
merge_heads.add(mr)
Finally after we have done that we can simply construct a set for each Merge in merge_heads:
resulting_sets = [set(merge) for merge in merge_heads]
and resulting_sets will be (order may vary):
[{1, 2, 3, 4}, {8, 10, 5}]
Putting it all together (without class definition):
vals = set(x for tup in array for x in tup)
dic = {val:Merge(val) for val in vals}
merge_heads = set(dic.values())
for frm,to in array:
mra = dic[frm].get_ancestor()
mrb = dic[to].get_ancestor()
mr = Merge(subs=(mra,mrb))
mra.parent = mr
mrb.parent = mr
merge_heads.remove(mra)
merge_heads.remove(mrb)
merge_heads.add(mr)
resulting_sets = [set(merge) for merge in merge_heads]
This will worst case run in O(n2), but you can balance the tree such that the ancestor is found in O(log n) instead, making it O(n log n). Furthermore you can short-circuit the list of ancestors, making it even faster.
You can use disjoint set.
Disjoint set is actually a kind of tree structure. Let's consider each number as a tree node, and every time we read in an edge (u, v), we just easily associate the two trees u and v in (if it does not exist, create an one-node tree instead) by pointing the root node of one tree to another's. At the end, we should just walk through the forest to get the result.
from collections import defaultdict
def relation(array):
mapping = {}
def parent(u):
if mapping[u] == u:
return u
mapping[u] = parent(mapping[u])
return mapping[u]
for u, v in array:
if u not in mapping:
mapping[u] = u
if v not in mapping:
mapping[v] = v
mapping[parent(u)] = parent(v)
results = defaultdict(set)
for u in mapping.keys():
results[parent(u)].add(u)
return [tuple(x) for x in results.values()]
In the code above, mapping[u] stores the ancestor of node u (parent or root). Specially, the ancestor of one-node tree's node is itself.
See my comment on Moinuddin's answer : this accepted answer does not validates the tests that you can found at http://rosettacode.org/wiki/Set_consolidation#Python . I did not dig it up though.
I would make a new proposition, based on Willem's answer.
The problem in this proposition is the recursivity in the get_ancestor calls : why should we climb up the tree each time we are asked our ancestor, when we could just remember the last root found (and still climb up from that point in case it changed). Indeed, Willem's algorithm is not linear (something like nlogn or n²) while we could remove this non-linearity just as easily.
Another problem comes from the iterator : if the tree is too deep (I had the problem in my use case), you get a Python Exception (Too much recursion) inside the iterator. So instead of building a full tree, we should merge sub leafs (and instead of having branches with 2 leafs, we build branches with N leafs).
My version of the code is as follow :
class Merge:
def __init__(self,value=None,parent=None,subs=None):
self.value = value
self.parent = parent
self.subs = subs
self.root = None
if self.subs:
subs_a,subs_b = self.subs
if subs_a.subs:
subs_a = subs_a.subs
else:
subs_a = [subs_a]
if subs_b.subs:
subs_b = subs_b.subs
else:
subs_b = [subs_b]
self.subs = subs_a+subs_b
for s in self.subs:
s.parent = self
s.root = None
def get_ancestor(self):
cur = self if self.root is None else self.root
while cur.parent is not None:
cur = cur.parent
if cur != self:
self.root = cur
return cur
def __iter__(self):
if self.value is not None:
yield self.value
elif self.subs:
for sub in self.subs:
for val in sub:
yield val
def treeconsolidate(array):
vals = set(x for tup in array for x in tup)
dic = {val:Merge(val) for val in vals}
merge_heads = set(dic.values())
for settomerge in array:
frm = settomerge.pop()
for to in settomerge:
mra = dic[frm].get_ancestor()
mrb = dic[to].get_ancestor()
if mra == mrb:
continue
mr = Merge(subs=[mra,mrb])
merge_heads.remove(mra)
merge_heads.remove(mrb)
merge_heads.add(mr)
resulting_sets = [set(merge) for merge in merge_heads]
return resulting_sets
In small merges, this will not change many things but my experience shows that climbing up the tree in huge sets of many elements can cost a lot : in my case, I have to deal with 100k sets, each of them containing between 2 and 1000 elements, and each element may appear in 1 to 1000 sets...
I think the most efficient way to achieve this will be using set as:
def transitive_cloure(array):
new_list = [set(array.pop(0))] # initialize first set with value of index `0`
for item in array:
for i, s in enumerate(new_list):
if any(x in s for x in item):
new_list[i] = new_list[i].union(item)
break
else:
new_list.append(set(item))
return new_list
Sample run:
>>> transitive_cloure([(1,2), (1,3), (2,4), (5,8), (8,10)])
[{1, 2, 3, 4}, {8, 10, 5}]
Comparison with other answers (on Python 3.4):
This answer: 6.238126921001822
>>> timeit.timeit("moin()", setup="from __main__ import moin")
6.238126921001822
Willem's solution: 29.115453064994654 (Time related to declaration of class is excluded)
>>> timeit.timeit("willem()", setup="from __main__ import willem")
29.115453064994654
hsfzxjy's solution: 10.049749890022213
>>> timeit.timeit("hsfzxjy()", setup="from __main__ import hsfzxjy")
10.049749890022213

Parsing an equation with custom functions in Python

I have a string that is a mathematical equation, but with some custom functions. I need to find all such functions and replace them with some code.
For example, I have a string:
a+b+f1(f2(x,y),x)
I want code that will replace (say) f2(x,y) with x+y^2 and f1(x,y) with sin(x+y).
It would be ideal if nested functions were supported, like in the example. However, it would still be useful if nesting was not supported.
As I understand from similar topics this can be done using a compiler module like compiler.parse(eq). How I can work with AST object created by compiler.parse(eq) to reconstruct my string back, replacing all found functions?
I need only to perform substitution and then string will be used in other program. Evaluation is not needed.
Here is a minimal working example (+, - , *, /, ** binary and unary operations and function call implemented). The priority of operations are set with parenthesis.
A little bit more than the functionality for the example given is done:
from __future__ import print_function
import ast
def transform(eq,functions):
class EqVisitor(ast.NodeVisitor):
def visit_BinOp(self,node):
#generate("=>BinOp")
generate("(")
self.visit(node.left)
self.visit(node.op)
#generate("ici",str(node.op),node._fields,node._attributes)
#generate(dir(node.op))
self.visit(node.right)
generate(")")
#ast.NodeVisitor.generic_visit(self,node)
def visit_USub(self,node):
generate("-")
def visit_UAdd(self,node):
generate("+")
def visit_Sub(self,node):
generate("-")
def visit_Add(self,node):
generate("+")
def visit_Pow(self,node):
generate("**")
def visit_Mult(self,node):
generate("*")
def visit_Div(self,node):
generate("/")
def visit_Name(self,node):
generate(node.id)
def visit_Call(self,node):
debug("function",node.func.id)
if node.func.id in functions:
debug("defined function")
func_visit(functions[node.func.id],node.args)
return
debug("not defined function",node.func.id)
#generate(node._fields)
#generate("args")
generate(node.func.id)
generate("(")
sep = ""
for arg in node.args:
generate (sep)
self.visit(arg)
sep=","
generate(")")
def visit_Num(self,node):
generate(node.n)
def generic_visit(self, node):
debug ("\n",type(node).__name__)
debug (node._fields)
ast.NodeVisitor.generic_visit(self, node)
def func_visit(definition,concrete_args):
class FuncVisitor(EqVisitor):
def visit_arguments(self,node):
#generate("visit arguments")
#generate(node._fields)
self.arguments={}
for concrete_arg,formal_arg in zip(concrete_args,node.args):
#generate(formal_arg._fields)
self.arguments[formal_arg.id]=concrete_arg
debug(self.arguments)
def visit_Name(self,node):
debug("visit Name",node.id)
if node.id in self.arguments:
eqV.visit(self.arguments[node.id])
else:
generate(node.id)
funcV=FuncVisitor()
funcV.visit(ast.parse(definition))
eqV=EqVisitor()
result = []
def generate(s):
#following line maybe usefull for debug
debug(str(s))
result.append(str(s))
eqV.visit(ast.parse(eq,mode="eval"))
return "".join(result)
def debug(*args,**kwargs):
#print(*args,**kwargs)
pass
Usage:
functions= {
"f1":"def f1(x,y):return x+y**2",
"f2":"def f2(x,y):return sin(x+y)",
}
eq="-(a+b)+f1(f2(+x,y),z)*4/365.12-h"
print(transform(eq,functions))
Result
((-(a+b)+(((sin((+x+y))+(z**2))*4)/365.12))-h)
WARNING
The code works with Python 2.7 and as it is AST dependent is not guaranteed to work with another version of Python. The Python 3 version doesn't work.
The full substitution is quite tricky. Here is my attempt to do it. Here we can successfully inline expressions,
but not in all scenarios. This code works on AST only, made by ast module. And uses codegen to stringify it back to code. The stringifying of ast and modifying ast in general is covered in other SO Q/A: "Parse a .py file, read the AST, modify it, then write back the modified source code".
First we define few helpers:
import ast
import codegen
import copy
def parseExpr(expr):
# Strip:
# Module(body=[Expr(value=
return ast.parse(expr).body[0].value
def toSource(expr):
return codegen.to_source(expr)
After that we define a substitution function using NodeTransformer.
For example:
substitute(parseExpr("a + b"), { "a": parseExpr("1") }) # 1 + b
The simulatenous substitution of multiple variables is needed to properly avoid nasty situations.
For example substituting both a and b for a + b in a + b.
The result should be (a + b) + (a + b), but if we substitute first a for a + b, we'll get (a + b) + b, and then substitute b, we'll get (a + (a + b)) + b which is the wrong result! So simultaneous is important:
class NameTransformer(ast.NodeTransformer):
def __init__(self, names):
self.names = names
def visit_Name(self, node):
if node.id in self.names:
return self.names[node.id]
else:
return node
def substitute(expr, names):
print "substitute"
for varName, varValue in names.iteritems():
print " name " + varName + " for " + toSource(varValue)
print " in " + toSource(expr)
return NameTransformer(names).visit(expr)
Then we write similar NodeTransformer to find calls, where we can inline function definitions:
class CallTransformer(ast.NodeTransformer):
def __init__(self, fnName, varNames, fnExpr):
self.fnName = fnName
self.varNames = varNames
# substitute in new fn expr for each CallTransformer
self.fnExpr = copy.deepcopy(fnExpr)
self.modified = False
def visit_Call(self, node):
if (node.func.id == self.fnName):
if len(node.args) == len(self.varNames):
print "expand call to " + self.fnName + "(" + (", ".join(self.varNames)) + ")" + " with arguments "+ ", ".join(map(toSource, node.args))
# We substitute in args too!
old_node = node
args = map(self.visit, node.args)
names = dict(zip(self.varNames, args))
node = substitute(self.fnExpr, names)
self.modified = True
return node
else:
raise Exception("invalid arity " + toSource(node))
else:
return self.generic_visit(node)
def substituteCalls(expr, definitions, n = 3):
while True:
if (n <= 0):
break
n -= 1
modified = False
for fnName, varNames, fnExpr in definitions:
transformer = CallTransformer(fnName, varNames, fnExpr)
expr = transformer.visit(expr)
modified = modified or transformer.modified
if not modified:
break
return expr
The substituteCalls is recursive so we can inline recursive functions too. Also there is an explicit limit, because some definitions might be infinitely recursive (as fact below). There is a bit of ugly looking copying, but it is required to separate different subtrees.
And the example code:
if True:
print "f1 first, unique variable names"
ex = parseExpr("a+b+f1(f2(x, y), x)")
ex = substituteCalls(ex, [
("f1", ["u", "v"], parseExpr("sin(u + v)")),
("f2", ["i", "j"], parseExpr("i + j ^ 2"))])
print toSource(ex)
print "---"
if True:
print "f1 first"
ex = parseExpr("a+b+f1(f2(x, y), x)")
ex = substituteCalls(ex, [
("f1", ["x", "y"], parseExpr("sin(x + y)")),
("f2", ["x", "y"], parseExpr("x + y ^ 2"))])
print toSource(ex)
print "---"
if True:
print "f2 first"
ex = parseExpr("f1(f1(x, x), y)")
ex = substituteCalls(ex, [
("f1", ["x", "y"], parseExpr("x + y"))])
print toSource(ex)
print "---"
if True:
print "fact"
ex = parseExpr("fact(n)")
ex = substituteCalls(ex, [
("fact", ["n"], parseExpr("n if n == 0 else n * fact(n-1)"))])
print toSource(ex)
print "---"
Which prints out:
f1 first, unique variable names
expand call to f1(u, v) with arguments f2(x, y), x
substitute
name u for f2(x, y)
name v for x
in sin((u + v))
expand call to f2(i, j) with arguments x, y
substitute
name i for x
name j for y
in ((i + j) ^ 2)
((a + b) + sin((((x + y) ^ 2) + x)))
---
f1 first
expand call to f1(x, y) with arguments f2(x, y), x
substitute
name y for x
name x for f2(x, y)
in sin((x + y))
expand call to f2(x, y) with arguments x, y
substitute
name y for y
name x for x
in ((x + y) ^ 2)
((a + b) + sin((((x + y) ^ 2) + x)))
---
f2 first
expand call to f1(x, y) with arguments f1(x, x), y
expand call to f1(x, y) with arguments x, x
substitute
name y for x
name x for x
in (x + y)
substitute
name y for y
name x for (x + x)
in (x + x)
((x + x) + ((x + x) + x))
---
fact
expand call to fact(n) with arguments n
substitute
name n for n
in n if (n == 0) else (n * fact((n - 1)))
expand call to fact(n) with arguments (n - 1)
substitute
name n for (n - 1)
in n if (n == 0) else (n * fact((n - 1)))
expand call to fact(n) with arguments ((n - 1) - 1)
substitute
name n for ((n - 1) - 1)
in n if (n == 0) else (n * fact((n - 1)))
n if (n == 0) else (n * (n - 1) if ((n - 1) == 0) else ((n - 1) * ((n - 1) - 1) if (((n - 1) - 1) == 0) else (((n - 1) - 1) * fact((((n - 1) - 1) - 1)))))
Unfortunately codegen version in pypi is buggy. It doesn't parenthesise expressions properly, even AST says they should. I used jbremer/codegen (pip install git+git://github.com/jbremer/codegen). It adds unnecessary parenthesis too, but it's better than no at all. Thanks to #XavierCombelle for the tip.
The substitution gets trickier if you have anonymous functions, i.e lambda. Then you need to rename variables. You could try to search for lambda calculus with substitution or implementation. Yet I had bad luck to find any articles which use Python for the task.
Do you know the variables beforehand?
I recommend using SymPy!
Take for example the following:
import sympy
a,b,x,y = sympy.symbols('a b x y')
f1 = sympy.Function('f1')
f2 = sympy.Function('f2')
readString = "a+b+f1(f2(x,y),x)"
z = eval(readString)
'z' will now be a symbolic term representing the mathematical formula. You can print it out. You can then use subs to replace symbolic terms or functions. You can either represent sine symbolically again (like f1 and f2) or you can possibly use the sin() in sympy.mpmath.
Depending on your needs, this approach is great because you can eventually compute, evaluate or simplify this expression.
What is your long term goal? Is it to evaluate the function or simply perform substitution? In the former case you can simply try this (note that f1 and f2 could also be dynamically defined):
import math
math.sin
def f2(x, y):
return x + y ** 2
def f1(x, y):
return math.sin(x + y)
a, b = 1, 2
x, y = 3, 4
eval('a + b + f1(f2(x, y), x)')
# 2.991148690709596
If you want to replace the functions and get back the modified version, you will indeed have to resort to some sort of AST parser. Be careful though with the use of eval, as this opens up a security hole for malicious user input code.
(Using sympy as adrianX suggested, with some extra code.)
Code below converts a given string to a new string after combining given functions. It's hasty and poorly documented, but it works.
WARNING!
Contains exec eval, malicious code could probably have an effected, if input is provided by external users.
UPDATE:
Rewrote the whole code. Works in Python 2.7.
Function arguments can be separated by comma or whitespace or both.
All examples in question and comments are working.
import re
import sympy
##################################################
# Input string and functions
initial_str = 'a1+myf1(myf2(a, b),y)'
given_functions = {'myf1(x,y)': 'cross(x,y)', 'myf2(a, b)': 'value(a,b)'}
##################################################
print '\nEXECUTED/EVALUATED STUFF:\n'
processed_str = initial_str
def fixed_power_op(str_to_fix):
return str_to_fix.replace('^', '**')
def fixed_multiplication(str_to_fix):
"""
Inserts multiplication symbol wherever omitted.
"""
pattern_digit_x = r"(\d)([A-Za-z])" # 4x -> 4*x
pattern_par_digit = r"(\))(\d)" # )4 -> )*4
pattern_digit_par = r"[^a-zA-Z]?_?(\d)(\()" # 4( -> 4*(
for patt in (pattern_digit_x, pattern_par_digit, pattern_digit_par):
str_to_fix = re.sub(patt, r'\1*\2', str_to_fix)
return str_to_fix
processed_str = fixed_power_op(processed_str)
class FProcessing(object):
def __init__(self, func_key, func_body):
self.func_key = func_key
self.func_body = func_body
def sliced_func_name(self):
return re.sub(r'(.+)\(.+', r'\1', self.func_key)
def sliced_func_args(self):
return re.search(r'\((.*)\)', self.func_key).group()
def sliced_args(self):
"""
Returns arguments found for given function. Arguments can be separated by comma or whitespace.
:returns (list)
"""
if ',' in self.sliced_func_args():
arg_separator = ','
else:
arg_separator = ' '
return self.sliced_func_args().replace('(', '').replace(')', '').split(arg_separator)
def num_of_sliced_args(self):
"""
Returns number of arguments found for given function.
"""
return len(self.sliced_args())
def functions_in_function_body(self):
"""
Detects functions in function body.
e.g. f1(x,y): sin(x+y**2), will result in "sin"
:returns (set)
"""
return set(re.findall(r'([a-zA-Z]+_?\w*)\(', self.func_body))
def symbols_in_func_body(self):
"""
Detects non argument symbols in function body.
"""
symbols_in_body = set(re.findall(r'[a-zA-Z]+_\w*', self.func_body))
return symbols_in_body - self.functions_in_function_body()
# --------------------------------------------------------------------------------------
# SYMBOL DETECTION (x, y, z, mz,..)
# Prohibited symbols
prohibited_symbol_names = set()
# Custom function names are prohibited symbol names.
for key in given_functions.keys():
prohibited_symbol_names |= {FProcessing(func_key=key, func_body=None).sliced_func_name()}
def symbols_in_str(provided_str):
"""
Returns a set of symbol names that are contained in provided string.
Allowed symbols start with a letter followed by 0 or more letters,
and then 0 or more numbers (eg. x, x1, Na, Xaa_sd, xa123)
"""
symbol_pattern = re.compile(r'[A-Za-z]+\d*')
symbol_name_set = re.findall(symbol_pattern, provided_str)
# Filters out prohibited.
symbol_name_set = {i for i in symbol_name_set if (i not in prohibited_symbol_names)}
return symbol_name_set
# ----------------------------------------------------------------
# EXEC SYMBOLS
symbols_in_given_str = symbols_in_str(initial_str)
# e.g. " x, y, sd = sympy.symbols('x y sd') "
symbol_string_to_exec = ', '.join(symbols_in_given_str)
symbol_string_to_exec += ' = '
symbol_string_to_exec += "sympy.symbols('%s')" % ' '.join(symbols_in_given_str)
exec symbol_string_to_exec
# -----------------------------------------------------------------------------------------
# FUNCTIONS
# Detects secondary functions (functions contained in body of given_functions dict)
sec_functions = set()
for key, val in given_functions.items():
sec_functions |= FProcessing(func_key=key, func_body=val).functions_in_function_body()
def secondary_function_as_exec_str(func_key):
"""
Used for functions that are contained in the function body of given_functions.
E.g. given_functions = {f1(x): sin(4+x)}
"my_f1 = sympy.Function('sin')(x)"
:param func_key: (str)
:return: (str)
"""
returned_str = "%s = sympy.Function('%s')" % (func_key, func_key)
print returned_str
return returned_str
def given_function_as_sympy_class_as_str(func_key, func_body):
"""
Converts given_function to sympy class and executes it.
E.g. class f1(sympy.Function):
nargs = (1, 2)
#classmethod
def eval(cls, x, y):
return cross(x+y**2)
:param func_key: (str)
:return: (None)
"""
func_proc_instance = FProcessing(func_key=func_key, func_body=func_body)
returned_str = 'class %s(sympy.Function): ' % func_proc_instance.sliced_func_name()
returned_str += '\n\tnargs = %s' % func_proc_instance.num_of_sliced_args()
returned_str += '\n\t#classmethod'
returned_str += '\n\tdef eval(cls, %s):' % ','.join(func_proc_instance.sliced_args())
returned_str = returned_str.replace("'", '')
returned_str += '\n\t\treturn %s' % func_body
returned_str = fixed_power_op(returned_str)
print '\n', returned_str
return returned_str
# Executes functions in given_functions' body
for name in sec_functions:
exec secondary_function_as_exec_str(func_key=name)
# Executes given_functions
for key, val in given_functions.items():
exec given_function_as_sympy_class_as_str(func_key=key, func_body=val)
final_result = eval(initial_str)
# PRINTING
print '\n' + ('-'*40)
print '\nRESULTS'
print '\nInitial string: \n%s' % initial_str
print '\nGiven functions:'
for key, val in given_functions.iteritems():
print '%s: ' % key, val
print '\nResult: \n%s' % final_result
I think you want to use something like PyBison which is a parser generator.
See an example that contains the basic code you need here:
http://freenet.mcnabhosting.com/python/pybison/calc.py
You need to add a token type for functions, and a rule for functions, and then what happens with that function if it is encountered.
If you need other information about parsing and so on, try to read some basic tutorials on Lex and (Yacc or Bison).

Python -- using regex on a class instance

I have a class that was taking in lists of 1's and 0's and performing GF(2) finite field arithmetic operations. It used to work until I tried to make it take the input in polynomial format. As for how the finite arithmetic will be done after fixing the regex issue, I was thinking about overloading the operators.
The actual code in parsePolyToListInput(input) works when outside the class. The problem seems to be in the regex, which errors that it will only take in a string (this makes sense), but does not seem to initialize with self.expr as a parameter (that's a problem). The #staticmethod just before the initialization was an attempt to salvage the unbound error as it the polynomial was passed in, but this is apparently completely wrong. Just to save you time if you decide to look at any of the arithmetic operations, modular inverse does not work (seems to be due to the formatting issue after every iteration of that while loop for division in the function and what the return type is):
import re
class gf2poly:
#binary arithemtic on polynomials
##staticmethod
def __init__(self,expr):
self.expr = expr
#self.expr = [int(i) for i in expr]
self.expr = gf2poly.parsePolyToListInput(self.expr)
def convert(self): #to clarify the input if necessary
convertToString = str(self.expr)
print "expression is %s"%(convertToString)
def id(self): #returns modulus 2 (1,0,0,1,1,....) for input lists
return [int(self.expr[i])%2 for i in range(len(self.expr))]
def listToInt(self): #converts list to integer for later use
result = gf2poly.id(self)
return int(''.join(map(str,result)))
def prepBinary(a,b): #converts to base 2 and orders min and max for use
a = gf2poly.listToInt(a); b = gf2poly.listToInt(b)
bina = int(str(a),2); binb = int(str(b),2)
a = min(bina,binb); b = max(bina,binb);
return a,b
#staticmethod
def outFormat(raw):
raw = str(raw[::-1]); g = [] #reverse binary string for enumeration
[g.append(i) for i,c in enumerate(raw) if c == '1']
processed = "x**"+' + x**'.join(map(str, g[::-1]))
if len(g) == 0: return 0 #return 0 if list empty
return processed #returns result in gf(2) polynomial form
def parsePolyToListInput(poly):
c = [int(i.group(0)) for i in re.finditer(r'\d+', poly)] #re.finditer returns an iterator
#m = max(c)
return [1 if x in c else 0 for x in xrange(max(c), -1, -1)]
#return d
def add(self,other): #accepts 2 lists as parameters
a = gf2poly.listToInt(self); b = gf2poly.listToInt(other)
bina = int(str(a),2); binb = int(str(b),2)
m = bina^binb; z = "{0:b}".format(m)
return z #returns binary string
def subtract(self,other): #basically same as add() but built differently
result = [self.expr[i] ^ other.expr[i] for i in range(len(max(self.expr,other.expr)))]
return int(''.join(map(str,result)))
def multiply(a,b): #a,b are lists like (1,0,1,0,0,1,....)
a,b = gf2poly.prepBinary(a,b)
g = []; bitsa = "{0:b}".format(a)
[g.append((b<<i)*int(bit)) for i,bit in enumerate(bitsa)]
m = reduce(lambda x,y: x^y,g); z = "{0:b}".format(m)
return z #returns product of 2 polynomials in gf2
def divide(a,b): #a,b are lists like (1,0,1,0,0,1,....)
a,b = gf2poly.prepBinary(a,b)
bitsa = "{0:b}".format(a); bitsb = "{0:b}".format(b)
difflen = len(str(bitsb)) - len(str(bitsa))
c = a<<difflen; q=0
while difflen >= 0 and b != 0: #a is divisor, b is dividend, b/a
q+=1<<difflen; b = b^c # b/a because of sorting in prep
lendif = abs(len(str(bin(b))) - len(str(bin(c))))
c = c>>lendif; difflen -= lendif
r = "{0:b}".format(b); q = "{0:b}".format(q)
return r,q #returns r remainder and q quotient in gf2 division
def remainder(a,b): #separate function for clarity when calling
r = gf2poly.divide(a,b)[0]; r = int(str(r),2)
return "{0:b}".format(r)
def quotient(a,b): #separate function for clarity when calling
q = gf2poly.divide(a,b)[1]; q = int(str(q),2)
return "{0:b}".format(q)
def extendedEuclideanGF2(a,b): # extended euclidean. a,b are GF(2) polynomials in list form
inita,initb=a,b; x,prevx=0,1; y,prevy = 1,0
while sum(b) != 0:
q = gf2poly.quotient(a,b);
q = list(q); q = [int(x) for x in q]
#q = list(q);
#q = tuple([int(i) for i in q])
q = gf2poly(q)
a,b = b,gf2poly.remainder(a,b);
#a = map(list, a);
#b = [list(x) for x in a];
#a = [int(x) for x in a]; b = [int(x) for x in b];
b = list(b); b = [int(x) for x in b]
#b = list(b);
#b = tuple([int(i) for i in b])
b = gf2poly(b)
#x,prevx = (prevx-q*x, x);
#y,prevy=(prevy-q*y, y)
print "types ",type(q),type(a),type(b)
#q=a//b; a,b = b,a%b; x,prevx = (prevx-q*x, x); y,prevy=(prevy-q*y, y)
#print("%d * %d + %d * %d = %d" % (inita,prevx,initb,prevy,a))
return a,prevx,prevy # returns gcd of (a,b), and factors s and t
def modular_inverse(a,mod): # where a,mod are GF(2) polynomials in list form
gcd,s,t = gf2poly.extendedEuclideanGF2(a,mod); mi = gf2poly.remainder(s,mod)
#gcd,s,t = ext_euc_alg_i(a,mod); mi = s%mod
if gcd !=1: return False
#print ("%d * %d mod %d = 1"%(a,mi,mod))
return mi # returns modular inverse of a,mod
I usually test it with this input:
a = x**14 + x**1 + x**0
p1 = gf2poly(a)
b = x**6 + x**2 + x**1
p2 = gf2poly(b)
The first thing you might notice about my code is that it's not very good. There are 2 reasons for that:
1) I wrote it so that the 1st version could do work in the finite field GF(2), and output in polynomial format. Then the next versions were supposed to be able to take polynomial inputs, and also perform the crucial 'modular inverse' function which is not working as planned (this means it's actually not working at all).
2) I'm teaching myself Python (I'm actually teaching myself programming overall), so any constructive criticism from pro Python programmers is welcome as I'm trying to break myself of beginner habits as quickly as possible.
EDIT:
Maybe some more of the code I've been testing with will help clarify what works and what doesn't:
t1 = [1,1,1]; t2 = [1,0,1]; t3 = [1,1]; t4 = [1, 0, 1, 1, 1, 1, 1]
t5 = [1,1,1,1]; t6 = [1,1,0,1]; t7 = [1,0,1,1,0]
f1 = gf2poly(t1); f2 = gf2poly(t2); f3 = gf2poly(t3); f4 = gf2poly(t4)
f5 = gf2poly(t5);f6 = gf2poly(t6);f7 = gf2poly(t7)
##print "subtract: ",a.subtract(b)
##print "add: ",a.add(b)
##print "multiply: ",gf2poly.multiply(f1,f3)
##print "multiply: ",gf2poly.multiply(f1,f2)
##print "multiply: ",gf2poly.multiply(f3,f4)
##print "degree a: ",a.degree()
##print "degree c: ",c.degree()
##print "divide: ",gf2poly.divide(f1,b)
##print "divide: ",gf2poly.divide(f4,a)
##print "divide: ",gf2poly.divide(f4,f2)
##print "divide: ",gf2poly.divide(f2,a)
##print "***********************************"
##print "quotient: ",gf2poly.quotient(f2,f5)
##print "remainder: ",gf2poly.remainder(f2,f5)
##testq = gf2poly.quotient(f4,f2)
##testr = gf2poly.remainder(f4,f2)
##print "quotient: ",testq,type(testq)
##print "remainder: ",testr,type(testr)
##print "***********************************"
##print "outFormat testp: ",gf2poly.outFormat(testq)
##print "outFormat testr: ",gf2poly.outFormat(testr)
##print "***********************************"
#print "gf2poly.modular_inverse(): ",gf2poly.modular_inverse(f2,f3)
print "p1 ",p1 #,type(f2),type(f3)
#print "parsePolyToListInput ",gf2poly.parsePolyToListInput(a)
Part of your problem is that you haven't declared self as an argument for parsePolyToListInput. When you call a method, the instance you call it on is implicitly bound as the first argument. Naming the first argument self is a convention, not a strict requirement - the instance is being bound to poly, which you then try to run a regexp over.
It looks me like there's some confusion in your design here about what's behavior of individual instances of the class and what's class-level or module-level behavior. In Python, it's perfectly acceptable to leave something that doesn't take an instance of a class as a parameter defined as a module-level function rather than shoehorning it in awkwardly. parsePolyToListInput might be one such function.
Your add implementation, similarly, has a comment saying it "accepts 2 lists as parameters". In fact, it's going to get a gf2poly instance as its first argument - this is probably right if you're planning to do operator overloading, but it means the second argument should also be a gf2poly instance as well.
EDIT:
Yeah, your example code shows a breakdown between class behavior and instance behavior. Either your multiply call should look something like this:
print "multiply: ",f1.multiply(f3)
Or multiply shouldn't be a method at all:
gfpoly.py:
def multiply(f1, f2):
a,b = prepBinary(a,b)
g = []; bitsa = "{0:b}".format(a)
[g.append((b<<i)*int(bit)) for i,bit in enumerate(bitsa)]
m = reduce(lambda x,y: x^y,g); z = "{0:b}".format(m)
return z #returns product of 2 polynomials in gf2
That latter approach is, for instance, how the standard math library does things.
The advantage of defining a multiplication method is that you could name it appropriately (http://docs.python.org/2/reference/datamodel.html#special-method-names) and use it with the * operator:
print "multiply: ",f1 *f3

How can memoization be applied to this algorithm?

After finding the difflib.SequenceMatcher class in Python's standard library to be unsuitable for my needs, a generic "diff"-ing module was written to solve a problem space. After having several months to think more about what it is doing, the recursive algorithm appears to be searching more than in needs to by re-searching the same areas in a sequence that a separate "search thread" may have also examined.
The purpose of the diff module is to compute the difference and similarities between a pair of sequences (list, tuple, string, bytes, bytearray, et cetera). The initial version was much slower than the code's current form, having seen a speed increase by a factor of ten. How can memoization be applied to the following code? What is the best way to rewrite the algorithm to further increase any possible speed?
class Slice:
__slots__ = 'prefix', 'root', 'suffix'
def __init__(self, prefix, root, suffix):
self.prefix = prefix
self.root = root
self.suffix = suffix
################################################################################
class Match:
__slots__ = 'a', 'b', 'prefix', 'suffix', 'value'
def __init__(self, a, b, prefix, suffix, value):
self.a = a
self.b = b
self.prefix = prefix
self.suffix = suffix
self.value = value
################################################################################
class Tree:
__slots__ = 'nodes', 'index', 'value'
def __init__(self, nodes, index, value):
self.nodes = nodes
self.index = index
self.value = value
################################################################################
def search(a, b):
# Initialize startup variables.
nodes, index = [], []
a_size, b_size = len(a), len(b)
# Begin to slice the sequences.
for size in range(min(a_size, b_size), 0, -1):
for a_addr in range(a_size - size + 1):
# Slice "a" at address and end.
a_term = a_addr + size
a_root = a[a_addr:a_term]
for b_addr in range(b_size - size + 1):
# Slice "b" at address and end.
b_term = b_addr + size
b_root = b[b_addr:b_term]
# Find out if slices are equal.
if a_root == b_root:
# Create prefix tree to search.
a_pref, b_pref = a[:a_addr], b[:b_addr]
p_tree = search(a_pref, b_pref)
# Create suffix tree to search.
a_suff, b_suff = a[a_term:], b[b_term:]
s_tree = search(a_suff, b_suff)
# Make completed slice objects.
a_slic = Slice(a_pref, a_root, a_suff)
b_slic = Slice(b_pref, b_root, b_suff)
# Finish the match calculation.
value = size + p_tree.value + s_tree.value
match = Match(a_slic, b_slic, p_tree, s_tree, value)
# Append results to tree lists.
nodes.append(match)
index.append(value)
# Return largest matches found.
if nodes:
return Tree(nodes, index, max(index))
# Give caller null tree object.
return Tree(nodes, index, 0)
Reference: How to optimize a recursive algorithm to not repeat itself?
As ~unutbu said, try the memoized decorator and the following changes:
#memoized
def search(a, b):
# Initialize startup variables.
nodes, index = [], []
a_size, b_size = len(a), len(b)
# Begin to slice the sequences.
for size in range(min(a_size, b_size), 0, -1):
for a_addr in range(a_size - size + 1):
# Slice "a" at address and end.
a_term = a_addr + size
a_root = list(a)[a_addr:a_term] #change to list
for b_addr in range(b_size - size + 1):
# Slice "b" at address and end.
b_term = b_addr + size
b_root = list(b)[b_addr:b_term] #change to list
# Find out if slices are equal.
if a_root == b_root:
# Create prefix tree to search.
a_pref, b_pref = list(a)[:a_addr], list(b)[:b_addr]
p_tree = search(a_pref, b_pref)
# Create suffix tree to search.
a_suff, b_suff = list(a)[a_term:], list(b)[b_term:]
s_tree = search(a_suff, b_suff)
# Make completed slice objects.
a_slic = Slice(a_pref, a_root, a_suff)
b_slic = Slice(b_pref, b_root, b_suff)
# Finish the match calculation.
value = size + p_tree.value + s_tree.value
match = Match(a_slic, b_slic, p_tree, s_tree, value)
# Append results to tree lists.
nodes.append(match)
index.append(value)
# Return largest matches found.
if nodes:
return Tree(nodes, index, max(index))
# Give caller null tree object.
return Tree(nodes, index, 0)
For memoization, dictionaries are best, but they cannot be sliced, so they have to be changed to lists as indicated in the comments above.
You could use the memoize decorator from the Python Decorator Library
and use it like this:
#memoized
def search(a, b):
The first time you call search with arguments a,b, the result is calculated and memoized (saved in a cache). The second time search is called with the same arguments, the result in returned from the cache.
Note that for the memoized decorator to work, the arguments must be hashable. If a and b are tuples of numbers, then they are hashable. If they are lists then you could convert them to tuples before passing them to search.
It doesn't look like search takes dicts as arguments, but if they were, then they would not be hashable and the memoization decorator would not be able to save the result in the cache.
It has been over 9 years since the question was asked, but the concept of internally caching results to speed up the algorithm was finally applied to the code today. The results of this application can be seen below:
#! /usr/bin/env python3
"""Compute differences and similarities between a pair of sequences.
After finding the "difflib.SequenceMatcher" class unsuitable, this module
was written and re-written several times into the polished version below."""
__author__ = 'Stephen "Zero" Chappell <Noctis.Skytower#gmail.com>'
__date__ = '3 September 2019'
__version__ = '$Revision: 4 $'
class Slice:
__slots__ = 'prefix', 'root', 'suffix'
def __init__(self, prefix, root, suffix):
self.prefix = prefix
self.root = root
self.suffix = suffix
class Match:
__slots__ = 'a', 'b', 'prefix', 'suffix', 'value'
def __init__(self, a, b, prefix, suffix, value):
self.a = a
self.b = b
self.prefix = prefix
self.suffix = suffix
self.value = value
class Tree:
__slots__ = 'nodes', 'index', 'value'
def __init__(self, nodes, index, value):
self.nodes = nodes
self.index = index
self.value = value
def search(a, b):
return _search(a, b, {})
def _search(a, b, memo):
# Initialize startup variables.
nodes, index = [], []
a_size, b_size = len(a), len(b)
# Begin to slice the sequences.
for size in range(min(a_size, b_size), 0, -1):
for a_addr in range(a_size - size + 1):
# Slice "a" at address and end.
a_term = a_addr + size
a_root = a[a_addr:a_term]
for b_addr in range(b_size - size + 1):
# Slice "b" at address and end.
b_term = b_addr + size
b_root = b[b_addr:b_term]
# Find out if slices are equal.
if a_root == b_root:
# Create prefix tree to search.
key = a_prefix, b_prefix = a[:a_addr], b[:b_addr]
if key not in memo:
memo[key] = _search(a_prefix, b_prefix, memo)
p_tree = memo[key]
# Create suffix tree to search.
key = a_suffix, b_suffix = a[a_term:], b[b_term:]
if key not in memo:
memo[key] = _search(a_suffix, b_suffix, memo)
s_tree = memo[key]
# Make completed slice objects.
a_slice = Slice(a_prefix, a_root, a_suffix)
b_slice = Slice(b_prefix, b_root, b_suffix)
# Finish the match calculation.
value = size + p_tree.value + s_tree.value
match = Match(a_slice, b_slice, p_tree, s_tree, value)
# Append results to tree lists.
nodes.append(match)
index.append(value)
# Return largest matches found.
if nodes:
return Tree(nodes, index, max(index))
# Give caller null tree object.
return Tree(nodes, index, 0)

Categories