I am trying to define a custom Theano Op with a gradient so I can use it with PyMC3, but I don't understand how to define the grad method.
The code below is where I'm stuck. The function phi() is a mock function (in practice it is an external program); for a scalar input x it returns a vector (phi_0(x), phi_1(x), ...). The function phi_diff() (also a mock) returns the vector (dphi_0/dx, dphi_1/dx, ...).
I wrapped phi() and phi_diff() in theano.Op objects, but my implementation of the grad function does not work. Theano's documentation contains simpler examples, and I don't understand how to adapt them to this case. Any help would be greatly appreciated.
import numpy as np
import theano.tensor as T
import theano

theano.config.optimizer = "None"
theano.config.exception_verbosity = "high"


def phi(x):
    return np.arange(n) * x


def phi_diff(x):
    return np.arange(n)


class PhiOp(theano.Op):
    itypes = [theano.tensor.dscalar]
    otypes = [theano.tensor.dvector]

    def perform(self, node, inputs, output_storage):
        x = inputs[0]
        output_storage[0][0] = phi(x)

    def grad(self, inputs, output_grads):
        x = inputs[0]
        # ???
        return [PhiDiffOp()(x) * output_grads[0]]


class PhiDiffOp(theano.Op):
    itypes = [theano.tensor.dscalar]
    otypes = [theano.tensor.dvector]

    def perform(self, node, inputs, output_storage):
        x = inputs[0]
        output_storage[0][0] = phi_diff(x)


n = 5
x = 777.

phi_op = PhiOp()
x_tensor = T.dscalar("x_tensor")
phi_func = theano.function([x_tensor], phi_op(x_tensor))
np.testing.assert_allclose(phi_func(x), phi(x))

T.jacobian(phi_op(x_tensor), x_tensor)
Found the solution; changes below. Since the input x is a scalar, grad has to return a scalar expression: each derivative dphi_i/dx is weighted by the corresponding entry of output_grads[0] and the products are summed (the chain rule collapses the vector output into a single scalar gradient). phi_diff also has to return floats rather than ints:
def phi_diff(x):
    return np.arange(n, dtype=np.float_)


class PhiOp(theano.Op):
    # itypes, otypes and perform unchanged
    def grad(self, inputs, output_grads):
        x = inputs[0]
        # chain rule: weight each dphi_i/dx by the incoming gradient and sum
        gg = (PhiDiffOp()(x) * output_grads[0]).sum()
        return [gg]
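To double-check the fix (my own addition, reusing the definitions above), the Jacobian can now be compiled and compared against phi_diff directly:

jac_func = theano.function([x_tensor], T.jacobian(phi_op(x_tensor), x_tensor))
np.testing.assert_allclose(jac_func(x), phi_diff(x))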
I want to add a @vectorize decorator to a function that takes 7 arguments. My setup looks like this:
n = 1000000
greyscales = np.floor(np.random.uniform(0, 255, n).astype(np.float32))
weights = np.random.normal(.5, .1, n).astype(np.float32)
exp

@vectorize('float32(float32)', target='cuda')
def normalize(greyscales):
    return greyscales / 255

@vectorize('float32(float32,float32)', target='cuda')
def weigh(values, weights):
    return values * weights

@vectorize('float32(float32)', target='cuda')
def activate(values):
    a = exp(values)
    b = exp(-values)
    return (a - b) / (a + b)
And the function itself looks like this:
@vectorize('float32(int,float32,float32,float32,float32,float32,float32)', target='cuda')
def create_hidden_layer(n, greyscales, weights, exp, normalize, weigh, activate):
    normalized = normalize(greyscales)
    weighted = weigh(normalized, weights)
    activated = activate(weighted)
    return activated
I got stuck choosing the signatures for @vectorize; in particular, I don't know how to declare n and exp. Maybe someone could help me?
As Rutger Kassies has also said, there does not seem to be a good reason why the create_hidden_layer function has to take the other functions as input, so you can simply remove that part to make it work. A couple of other things to note:
numba can infer which types a function accepts/returns, so specifying them is not mandatory.
For exp, just use NumPy's exp function, which already has very good performance.
This version of the code should work (I removed the CUDA part so it runs on my machine):
import numpy as np
from numba import vectorize

n = 1000000
greyscales = np.floor(np.random.uniform(0, 255, n).astype(np.float32))
weights = np.random.normal(.5, .1, n).astype(np.float32)

@vectorize('float32(float32)')
def normalize(greyscales):
    return greyscales / 255

@vectorize('float32(float32,float32)')
def weigh(values, weights):
    return values * weights

@vectorize('float32(float32)')
def activate(values):
    a = np.exp(values)
    b = np.exp(-values)
    return (a - b) / (a + b)

@vectorize
def create_hidden_layer(n, greyscales, weights):
    normalized = normalize(greyscales)
    weighted = weigh(normalized, weights)
    activated = activate(weighted)
    return activated

create_hidden_layer(n, greyscales, weights)
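One caveat if you later restore the CUDA target: target='cuda' does not support lazy, signature-free compilation, so explicit signatures become mandatory again, e.g. for normalize (the same signature the question already used):

from numba import vectorize

@vectorize('float32(float32)', target='cuda')
def normalize(greyscales):
    return greyscales / 255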
I have implemented the following Jacobian function in PyTorch. Unless I have made a mistake, it computes the Jacobian of any tensor w.r.t. inputs of any dimensionality:
import torch
import torch.autograd as ag

def nd_range(stop, dims=None):
    if dims is None:
        dims = len(stop)
    if not dims:
        yield ()
        return
    for outer in nd_range(stop, dims - 1):
        for inner in range(stop[dims - 1]):
            yield outer + (inner,)

def full_jacobian(f, wrt):
    f_shape = list(f.size())
    wrt_shape = list(wrt.size())
    fs = []
    f_range = nd_range(f_shape)
    wrt_range = nd_range(wrt_shape)
    for f_ind in f_range:
        grad = ag.grad(f[tuple(f_ind)], wrt, retain_graph=True, create_graph=True)[0]
        for i in range(len(f_shape)):
            grad = grad.unsqueeze(0)
        fs.append(grad)
    fj = torch.cat(fs, dim=0)
    fj = fj.view(f_shape + wrt_shape)
    return fj
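A quick sanity check of full_jacobian (my own example, not from the original post): for an elementwise square, the Jacobian should be a diagonal matrix containing 2*x:

x = torch.randn(3, requires_grad=True)
J = full_jacobian(x ** 2, x)  # shape (3, 3)
print(torch.allclose(J, torch.diag(2 * x)))  # True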
On top of this, I have tried to implement a recursive function to calculate nth order derivatives:
def nth_derivative(f, wrt, n):
    if n == 1:
        return full_jacobian(f, wrt)
    else:
        deriv = nth_derivative(f, wrt, n - 1)
        return full_jacobian(deriv, wrt)
I ran a simple test:
s = torch.randn(3, requires_grad=True)  # example input; s is not defined in the original post
op = torch.ger(s, s)
deep_deriv = nth_derivative(op, s, 5)
Unfortunately, this succeeds in getting me the Hessian, but no higher-order derivatives. I'm aware many higher-order derivatives should be 0, but I'd prefer it if PyTorch could compute that analytically.
One fix has been to change the gradient calculation to:
try:
    grad = ag.grad(f[tuple(f_ind)], wrt, retain_graph=True, create_graph=True)[0]
except:
    grad = torch.zeros_like(wrt)
Is this the accepted correct way to handle this? Or is there a better option? Or do I have the reason for my issue completely wrong to begin with?
You can just iterate calling the grad function:
import torch
from torch.autograd import grad

def nth_derivative(f, wrt, n):
    for i in range(n):
        grads = grad(f, wrt, create_graph=True)[0]
        f = grads.sum()
    return grads

x = torch.arange(4, dtype=torch.float32, requires_grad=True).reshape(2, 2)  # a float dtype is required for gradients
loss = (x ** 4).sum()
print(nth_derivative(f=loss, wrt=x, n=3))
which outputs (matching the analytic third derivative of x**4, namely 24*x):
tensor([[ 0., 24.],
[ 48., 72.]])
For the second-order derivative, you can use PyTorch's hessian function:
torch.autograd.functional.hessian()
For higher-order derivatives, you can call jacobian or grad repeatedly while keeping the computational graph, as the docs note for create_graph:
create_graph (bool, optional) – If True, graph of the derivative will be constructed, allowing to compute higher order derivative products.
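For completeness, a minimal sketch of the functional API (my own example, not from the answer):

import torch

def f(x):
    return (x ** 4).sum()

x = torch.arange(4, dtype=torch.float32).reshape(2, 2)
H = torch.autograd.functional.hessian(f, x)
print(H.shape)  # torch.Size([2, 2, 2, 2])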
I have created a theano.Op that returns the distance between each pair of the two collections of inputs, wrapping scipy's cdist:
import theano
from theano import gof
from scipy.spatial import distance

class Cdist(theano.Op):
    __props__ = ()

    def __init__(self):
        # self.fn = scipy_cdist2
        super(Cdist, self).__init__()

    def make_node(self, x, w):
        # print('make_node')
        return gof.Apply(self, [x, w], [x.type()])

    def perform(self, node, inputs, output_storage):
        # print('perform')
        x, w = inputs[0], inputs[1]
        z = output_storage[0]
        z[0] = distance.cdist(x, w, 'euclidean')
It works, but now I want to add the grad method. I have read the guide and the documentation about grad, but I still don't understand how it works. For example, to get the gradient of an Op that returns a*x + b, the guide uses:
def grad(self, inputs, output_grads):
    return [a * output_grads[0] + b]
Why? I'm going to quote what the documentation says about grad:

If the output list of the op is [f_1, ..., f_n], then the list output_gradients is [grad_{f_1}(C), grad_{f_2}(C), ..., grad_{f_n}(C)]. If inputs consists of the list [x_1, ..., x_m], then Op.grad should return the list [grad_{x_1}(C), grad_{x_2}(C), ..., grad_{x_m}(C)], where (grad_{y}(Z))_i = \frac{\partial Z}{\partial y_i} (and i can stand for multiple dimensions).
Is this telling me that I have to write the gradient myself? But in the example they build a combination of output_grads and integer values. I really don't understand.
There's nothing wrong with the docs. In the grad method you write a symbolic expression, as opposed to the perform method, where you write a numerical computation.
The grad method is called from theano.grad, while perform is called inside the compiled function.
For example, assuming euclidean distance:
def grad(self, inputs, out_grads):
    x, y = inputs  # matrices of shape [mA, n] and [mB, n]
    g, = out_grads  # matrix of shape [mA, mB]
    diff = x.dimshuffle(0, 'x', 1) - y.dimshuffle('x', 0, 1)  # [mA, mB, n] tensor
    z = T.sqrt(T.sum(T.sqr(diff), axis=2, keepdims=True))
    diff = g * diff / z
    return [T.sum(diff, axis=1), -T.sum(diff, axis=0)]
For this particular case, I'd suggest writing an L_op instead of grad. The L_op additionally receives the outputs of the forward Op, so they can be reused instead of recomputed:
def L_op(self, inputs, outputs, out_grads):
    x, y = inputs  # matrices of shape [mA, n] and [mB, n]
    z, = outputs  # matrix of shape [mA, mB]
    g, = out_grads  # idem
    diff = x.dimshuffle(0, 'x', 1) - y.dimshuffle('x', 0, 1)  # [mA, mB, n] tensor
    diff = g.dimshuffle(0, 1, 'x') * diff / z.dimshuffle(0, 1, 'x')
    return [T.sum(diff, axis=1), -T.sum(diff, axis=0)]
Well, the grad expressions are probably wrong, but you get the idea.
As you can see, we are calling symbolic functions such as dimshuffle. However, there are cases where you want to write a separate Op class for the gradient, either because the symbolic graph would be too inefficient or because you want a custom gradient.
For example:
class CDistGrad(theano.Op):
    def __init__(...):
        # <...>
        pass

    def c_code(...):
        # implement this in case you want more performance
        pass

    def perform(...):
        # <...>
        pass

    def make_node(...):
        # <...>
        pass


class CDist(theano.Op):
    # <...>
    def grad(self, inputs, output_grads):
        return CDistGrad()(*inputs, *output_grads)
Still, a symbolic expression is used in the grad method; a custom Op simply replaces the vanilla Theano expression.
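Whichever variant you implement, Theano can check the gradient numerically for you. A minimal sketch (my own addition; the shapes are arbitrary):

import numpy as np
from theano.gradient import verify_grad

x_val = np.random.rand(4, 3)
w_val = np.random.rand(5, 3)
# raises if the symbolic gradient disagrees with a finite-difference estimate
verify_grad(lambda x, w: Cdist()(x, w), [x_val, w_val], rng=np.random)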
Hi, I am trying to build a simple autoencoder model in Theano, but I also want to optimize it with SciPy's optimize.minimize. I found some code here: http://dlacombejr.github.io/programming/2015/09/13/sparse-filtering-implemenation-in-theano.html which I modified a bit. The problem is that scipy.optimize.minimize takes a single array, but I need to optimize more than one weight parameter. What is the solution to this problem if I want to keep the same structure as the code in the link?
I found a possible way around this: pass all the weights in as a single flattened and concatenated array, then assign slices of that array to the various variables, following https://github.com/Theano/Theano/issues/3131
import theano
from theano import tensor as t
import numpy as np

class SparseFilter(object):
    def __init__(self, theta, dims, x):
        # assign inputs to sparse filter
        self.theta = theta
        self.x = x
        self.dims = dims
        self.w = theano.shared(np.random.randn(self.dims[1], self.dims[0]), name='w')
        self.w = t.reshape(self.theta[0:(self.dims[0] * self.dims[1])], self.dims)
        self.b = theano.shared(np.random.randn(self.dims[1]), name='b')
        self.b = t.reshape(self.theta[(self.dims[0] * self.dims[1]):self.theta.get_value().shape[0]], (self.dims[0], 1))

    # the feed-forward function is not fully written
    def feed_forward(self):
        f = t.dot(self.w, self.x.T) + self.b
        return f

    def get_cost_grads(self):
        cost = t.sum(t.abs_(self.feed_forward()))
        gradw = t.grad(cost=cost, wrt=self.w).flatten()
        gradb = t.grad(cost=cost, wrt=self.b).flatten()
        return cost, gradw, gradb
def training_functions(data, model, dims):
    cost, gradw, gradb = model.get_cost_grads()
    fn = theano.function(inputs=[], outputs=[cost, gradw, gradb],
                         givens={model.x: data}, allow_input_downcast=True)

    def train_fn(theta_value):
        # reshape the theta value for Theano and convert to float32
        model.w = t.reshape(theta_value[0:(dims[0] * dims[1])], dims)
        model.b = t.reshape(theta_value[(dims[0] * dims[1]):theta_value.shape[0]], (dims[0], 1))
        c, gw, gb = fn()
        # convert values to float64 for SciPy
        c = np.asarray(c, dtype=np.float64)
        gw = np.asarray(gw, dtype=np.float64)
        gb = np.asarray(gb, dtype=np.float64)
        grad = np.append(gw, gb)
        return c, grad

    return train_fn
# input_dim, hdim, dims, x and data are defined elsewhere in the original code
theta = theano.shared(np.random.randn((input_dim * hdim) + hdim), name='theta')
np.random.seed(0)
theta.set_value(np.random.randn((input_dim * hdim) + hdim).astype('float32'))
model = SparseFilter(theta, dims, x)
train_fn = training_functions(data, model, dims)

from scipy.optimize import minimize
weights = minimize(train_fn, model.theta.eval(),
                   method='L-BFGS-B', jac=True,
                   options={'maxiter': 100, 'disp': True})
There is no iteration and the optimization stops immediately. The output contains the message:

message: 'ABNORMAL_TERMINATION_IN_LNSRCH'

Any suggestions? Thank you.
How can it be that this works:
g_W = T.grad(cost=cost, wrt=classifier.vparamW)
whereas this
H_W=T.hessian(cost=cost, wrt=classifier.vparamW)
gives a NotImplementedError?
Could the problem be in this cost function:
-T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
Here y is the vector of class labels from 0 to n-1, and
self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
I am unable to reproduce this problem with the limited code that has been provided. However, here is a fully working demo of T.grad and T.hessian.
import numpy
import theano
import theano.tensor as T

x = T.matrix()
w_flat = theano.shared(numpy.random.randn(3, 2).astype(theano.config.floatX).flatten())
w = w_flat.reshape((3, 2))
cost = T.pow(theano.dot(x, w), 2).sum()
g_w = T.grad(cost=cost, wrt=[w])
h_w = T.hessian(cost=cost, wrt=[w_flat])
f = theano.function([x], outputs=g_w + h_w)
for output in f(numpy.random.randn(4, 3).astype(theano.config.floatX)):
    print(output.shape, '\n', output)
Note that the wrt value for T.hessian needs to be a vector; that is why the demo keeps the flat w_flat shared variable and only reshapes it into a matrix for the forward computation.
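Applied back to the question, a hedged sketch (all names and sizes here are assumptions, since the classifier's definitions were not posted): keep the parameters as a flat shared vector, reshape them for the softmax, and take the Hessian w.r.t. the flat vector:

import numpy
import theano
import theano.tensor as T

n_in, n_out = 3, 2  # hypothetical layer sizes
x = T.matrix('x')
y = T.lvector('y')
vparamW = theano.shared(numpy.random.randn(n_in * n_out).astype(theano.config.floatX))
W = vparamW.reshape((n_in, n_out))
b = theano.shared(numpy.zeros(n_out, dtype=theano.config.floatX))
p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)
cost = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
H_W = T.hessian(cost=cost, wrt=vparamW)  # works: wrt is a vector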