How to add the grad method to a theano Op? - python

I have create a theano.Op that returns distance between each pair of the two collections of inputs, converting the scipy cdist:
class Cdist(theano.Op):
__props__ = ()
def __init__(self):
#self.fn = scipy_cdist2
super(Cdist, self).__init__()
def make_node(self, x, w):
#print('make_node')
return gof.Apply(self, [x, w], [x.type()])
def perform(self, node, inputs, output_storage):
#print('perform')
x, w = inputs[0], inputs[1]
z = output_storage[0]
z[0] = distance.cdist(x, w, 'euclidean')
It works, but now want to add the grad method. I have read the guide and the documentation about the grad method. But i still dont't understand how it works. For example in the guide to get the gradient of a method that return a*x + b, they use:
def grad(self, inputs, output_grads):
return [a * output_grads[0] + b]
why? I'm going to quote what is written in the documentation about the grad:
If the output list of the op is [f_1, ... f_n], then the list
output_gradients is [grad_{f_1}(C), grad_{f_2}(C), ... ,
grad_{f_n}(C)]. If inputs consists of the list [x_1, ..., x_m], then
Op.grad should return the list [grad_{x_1}(C), grad_{x_2}(C), ...,
grad_{x_m}(C)], where (grad_{y}(Z))_i = \frac{\partial Z}{\partial
y_i} (and i can stand for multiple dimensions).
They are told me that i have to write the gradient? But in the example the make a combination of output_grads and integger values. Really i'm not understanding.

There's nothing wrong about docs. In grad method you should write a symbolic expression, as opposed to perform method where you write a numerical expression.
grad method is called from theano.grad, while perform is called inside the compiled function.
For example, assuming euclidean distance:
def grad(self, inputs, out_grads):
x, y = inputs # matrices of shape [mA, n] and [mB, n]]
g, = out_grads # matrix of shape [mA, mB]
diff = x.dimshuffle(0, 'x', 1) - y.dimshuffle('x', 0, 1) # [mA, mB, n] tensor
z = T.sqrt(T.sum(T.sqr(diff), axis=2, keepdims=True))
diff = g * diff / z
return [T.sum(diff, axis=1), -T.sum(diff, axis=0)]
For this particular case, I'd suggest writing a L_op instead of grad. L_op additionally reuses output in the forward Op.
def L_op(self, inputs, outputs, out_grads):
x, y = inputs # matrices of shape [mA, n] and [mB, n]
z, = outputs # matrix of shape [mA, mB]
g, = out_grads # idem
diff = x.dimshuffle(0, 'x', 1) - y.dimshuffle('x', 0, 1) # [mA, mB, n] tensor
diff = g.dimshuffle(0, 1, 'x') * diff / z.dimshuffle(0, 1, 'x')
return [T.sum(diff, axis=1), -T.sum(diff, axis=0)]
Well, the grad expressions are probably wrong but you get the idea.
As you can see, we are calling symbolic functions such as dimshuffle. However there are cases where you want to write a class for grad Op. Either because the symbolic graph is too inefficient or you want a custom gradient.
For example:
class CDistGrad(theano.Op):
def __init__(...):
# <...>
pass
def c_code(...):
# implement this in case you want more performance
pass
def perform(...):
# <...>
pass
def make_node(...):
# <...>
pass
class CDist(theano.Op):
# <...>
def grad(self, inputs, output_grads):
return CDistGrad()(*inputs, *output_grads)
Still, symbolic expression is used in grad method. Just a custom Op replaced vanilla Theano expression.

Related

Understanding JAX argnums parameter in its gradient function

I'm trying to understand the behaviour of argnums in JAX's gradient function.
Suppose I have the following function:
def make_mse(x, t):
def mse(w,b):
return np.sum(jnp.power(x.dot(w) + b - t, 2))/2
return mse
And I'm taking the gradient in the following way:
w_gradient, b_gradient = grad(make_mse(train_data, y), (0,1))(w,b)
argnums= (0,1) in this case, but what does it mean? With respect to which variables the gradient is calculated? What will be the difference if I will use argnums=0 instead?
Also, can I use the same function to get the Hessian matrix?
I looked at JAX help section about it, but couldn't figure it out
When you pass multiple argnums to grad, the result is a function that returns a tuple of gradients, equivalent to if you had computed each separately:
def f(x, y):
return x ** 2 + x * y + y ** 2
df_dxy = grad(f, argnums=(0, 1))
df_dx = grad(f, argnums=0)
df_dy = grad(f, argnums=1)
x = 3.0
y = 4.25
assert df_dxy(x, y) == (df_dx(x, y), df_dy(x, y))
If you want to compute a mixed second derivatives, you can do this by repeatedly applying the gradient:
d2f_dxdy = grad(grad(f, argnums=0), argnums=1)
assert d2f_dxdy(x, y) == 1

I'm getting a ValueError when writing a method plot for Newton's method

I have an assignment for school. First of all can you help me with confirming I have interpreted the question right? And also does the code seem somewhat ok? There have been other tasks before this like create the class with a two dimensional function, write the newton method and so on. And now this question. Im not finished programming it, but Im a bit stuck and I feel like I dont know exactly what to do. On what do I run my Newton method? On the point P. Do I create it like I have done in the Plot method??
This is the question:
Write a method plot that checks the dependence of Newton’s method on
several initial vectors x0. This method should plot what is described
in the following steps:
• Use the meshgrid command to set up a grid of
N2 points in the set G = [a, b]×[c, d] (the parameters N, a, b, c and
d are parameters of the methods). You obtain two matrices X and Y
where a specific grid point is defined as pij = (Xij , Yij )
class fractals2D(object):
Allzeroes = [] #a list to add all stored values from each run of newtons method
def __init__(self,f, x):
self.f=f
f0 = self.f(x) #giving a variable name with the function to use in ckass
n=len(x) #for size of matrice
jac=zeros([n]) #creates an array to use for jacobian matrice
h=1.e-8 #to set h for derivative
self.jac = jac
for i in range(n): #creating loop to take partial derivatives of x and y from x in f
temp=x[i]
#print(x[i])
x[i]=temp +h #why setting x[i] two times?
#print(x[i])
f1=f(x)
x[i]=temp
#print(x[i])
jac[:,i]=(f1-f0)/h
def Newtons_method(self,guess):
f_val = f(guess)
self.guess = guess
for i in range(40):
delta = solve(self.jac,-f_val)
guess = guess +delta
if norm((delta),ord=2)<1.e-9:
return guess #alist for storing zeroes from one run
def ZeroesMethod(self, point):
point = self.guess
self.Newtons_method(point)
#adds zeroes from the run of newtons to a list to store them all
self.Allzeroes.append(self.guess)
return (len(self.Allzeroes)) #returns how many zeroes are found
def plot(self, N, a, b, c, d):
x = np.linspace(a, b, N)
y = np.linspace(c, d, N)
P = [X, Y] = np.meshgrid(x, y)
return P #calling ZeroesMethos with our newly meshed point of several arrays
x0 = array([2.0, 1.0]) #creates an x and y value?
x1= array([1, -5])
a= array([2, 8])
b = array([-2, -6])
def f(x):
f = np.array(
[x[0]**2 - x[1] + x[0]*cos(pi*x[0]),
x[0]*x[1] + exp(-x[1]) - x[0]**(-1)])
This is the errormessage im receiving:
delta = solve(self.jac,-f_val)
TypeError: bad operand type for unary -: 'NoneTyp

Predict function for multiple linear regression

I am trying to make a predict function for a homework problem where it takes the dot products of a matrix(x) and a vector(y) and inserts them into a numpy array
def predict(x, y):
y_hat = np.empty
for j in range(len(y)):
y_hat[i] = np.dot(x, y)
return y_hat
There is an error message on y_hat[i] = np.dot(x,y)
There are two errors in the code:
numpy.empty() is a method which get arguments for the shape. Here, you must define it as np.empty([len(y), len(x)]) (if x is matrix and y is a vector,np.dot(x, y) results a vector with length len(x)). It produces a placeholder for np.dot() resulted arrays.
variable i is not defined.
so:
def predict(x, y):
y_hat = np.empty([len(y), len(x)])
for j in range(len(y)):
y_hat[j] = np.dot(x, y)
return y_hat

Higher order gradients in pytorch

I have implemented the following Jacobian function in pytorch. Unless I have made a mistake, it computes the Jacobian of any tensor w.r.t. any dimensional inputs:
import torch
import torch.autograd as ag
def nd_range(stop, dims = None):
if dims == None:
dims = len(stop)
if not dims:
yield ()
return
for outer in nd_range(stop, dims - 1):
for inner in range(stop[dims - 1]):
yield outer + (inner,)
def full_jacobian(f, wrt):
f_shape = list(f.size())
wrt_shape = list(wrt.size())
fs = []
f_range = nd_range(f_shape)
wrt_range = nd_range(wrt_shape)
for f_ind in f_range:
grad = ag.grad(f[tuple(f_ind)], wrt, retain_graph=True, create_graph=True)[0]
for i in range(len(f_shape)):
grad = grad.unsqueeze(0)
fs.append(grad)
fj = torch.cat(fs, dim=0)
fj = fj.view(f_shape + wrt_shape)
return fj
On top of this, I have tried to implement a recursive function to calculate nth order derivatives:
def nth_derivative(f, wrt, n):
if n == 1:
return full_jacobian(f, wrt)
else:
deriv = nth_derivative(f, wrt, n-1)
return full_jacobian(deriv, wrt)
I ran a simple test:
op = torch.ger(s, s)
deep_deriv = nth_derivative(op, s, 5)
Unfortunately, this succeeds in getting me the Hessian...but no higher order derivatives. I'm aware many higher order derivatives should be 0, but I'd prefer if pytorch can analytically compute that.
One fix has been to change the gradient calculation to:
try:
grad = ag.grad(f[tuple(f_ind)], wrt, retain_graph=True, create_graph=True)[0]
except:
grad = torch.zeros_like(wrt)
Is this the accepted correct way to handle this? Or is there a better option? Or do I have the reason for my issue completely wrong to begin with?
You can just iterate calling the grad function:
import torch
from torch.autograd import grad
def nth_derivative(f, wrt, n):
for i in range(n):
grads = grad(f, wrt, create_graph=True)[0]
f = grads.sum()
return grads
x = torch.arange(4, requires_grad=True).reshape(2, 2)
loss = (x ** 4).sum()
print(nth_derivative(f=loss, wrt=x, n=3))
outputs
tensor([[ 0., 24.],
[ 48., 72.]])
For the second order derivative, you can use PyTorch's hessian function:
torch.autograd.functional.hessian()
For higher order derivatives, you can repeatedly call jacobian or grad while maintaining the computational graph:
create_graph (bool, optional) – If True, graph of the derivative will be constructed, allowing to compute higher order derivative products.

Error in backpropagation python neural net

Darn thing just won't learn. Sometimes weights seem to become nan.
I haven't played with different numbers of hidden layers/inputs/outputs but the bug appears consistent across different sizes of hidden layer.
from __future__ import division
import numpy
import matplotlib
import random
class Net:
def __init__(self, *sizes):
sizes = list(sizes)
sizes[0] += 1
self.sizes = sizes
self.weights = [numpy.random.uniform(-1, 1, (sizes[i+1],sizes[i])) for i in range(len(sizes)-1)]
#staticmethod
def activate(x):
return 1/(1+numpy.exp(-x))
def y(self, x_):
x = numpy.concatenate(([1], numpy.atleast_1d(x_.copy())))
o = [x] #o[i] is the (activated) output of hidden layer i, "hidden layer 0" is inputs
for weight in self.weights[:-1]:
x = weight.dot(x)
x = Net.activate(x)
o.append(x)
o.append(self.weights[-1].dot(x))
return o
def __call__(self, x):
return self.y(x)[-1]
def delta(self, x, t):
o = self.y(x)
delta = [(o[-1]-t) * o[-1] * (1-o[-1])]
for i, weight in enumerate(reversed(self.weights)):
delta.append(weight.T.dot(delta[-1]) * o[-i-2] * (1-o[-i-2]))
delta.reverse()
return o, delta
def train(self, inputs, outputs, epochs=100, rate=.1):
for epoch in range(epochs):
pairs = zip(inputs, outputs)
random.shuffle(pairs)
for x, t in pairs: #shuffle? subset?
o, d = self.delta(x, t)
for layer in range(len(self.sizes)-1):
self.weights[layer] -= rate * numpy.outer(o[layer+1], d[layer])
n = Net(1, 4, 1)
x = numpy.linspace(0, 2*3.14, 10)
t = numpy.sin(x)
matplotlib.pyplot.plot(x, t, 'g')
matplotlib.pyplot.plot(x, map(n, x), 'r')
n.train(x, t)
print n.weights
matplotlib.pyplot.plot(x, map(n, x), 'b')
matplotlib.pyplot.show()
I haven't looked for a particular bug in your code, but can you please try the following things to narrow down your problem further? Otherwise it is very tedious to find the needle in the haystack.
1) Please try to use a real dataset to have an idea what to expect, e.g., MNIST, and/or standardize your data, because your weights may become NaN if they become too small.
2) Try different learning rates and plot the cost function vs. epochs to check if you are converging. It should look somewhat like this (note that I used minibatch learning and averaged the minibatch chunks for each epoch).
3) I see that you are using a sigmoid activation, your implementation is correct, but to make it numerically more stable, replace 1.0 / (1.0 + np.exp(-z)) by expit(z) from scipy.special (same function but more efficient).
4) Implement gradient checking. Here, you compare the analytical solution to a numerically approximated gradient
Or an even better approach that yields a more accurate approximation of the gradient is to compute the symmetric (or centered) difference quotient given by the two-point formula
PS: If you are interested and find it useful, I have a working vanilla NumPy neural net implemented here.
I fixed it! Thanks for all the suggestions. I worked out numeric partials and found that my o and deltas were correct, but I was multiplying the wrong ones. That's why I now take numpy.outer(d[layer+1], o[layer]) instead of numpy.outer(d[layer], o[layer+1]).
I was also skipping the update on one layer. That's why I changed for layer in range(self.hidden_layers) to for layer in range(self.hidden_layers+1).
I'll add that I caught a bug just before posting originally. My output layer delta was incorrect because my net (intentionally) doesn't activate the final outputs, but my delta was computed as though it did.
Debugged primarily with a one hidden layer, one hidden unit net, then moved to a 2 input, 3 hidden layers of 2 neurons each, 2 output model.
from __future__ import division
import numpy
import scipy
import scipy.special
import matplotlib
#from pylab import *
#numpy.random.seed(23)
def nmap(f, x):
return numpy.array(map(f, x))
class Net:
def __init__(self, *sizes):
self.hidden_layers = len(sizes)-2
self.weights = [numpy.random.uniform(-1, 1, (sizes[i+1],sizes[i])) for i in range(self.hidden_layers+1)]
#staticmethod
def activate(x):
return scipy.special.expit(x)
#return 1/(1+numpy.exp(-x))
#staticmethod
def activate_(x):
s = scipy.special.expit(x)
return s*(1-s)
def y(self, x):
o = [numpy.array(x)] #o[i] is the (activated) output of hidden layer i, "hidden layer 0" is inputs and not activated
for weight in self.weights[:-1]:
o.append(Net.activate(weight.dot(o[-1])))
o.append(self.weights[-1].dot(o[-1]))
# for weight in self.weights:
# o.append(Net.activate(weight.dot(o[-1])))
return o
def __call__(self, x):
return self.y(x)[-1]
def delta(self, x, t):
x = numpy.array(x)
t = numpy.array(t)
o = self.y(x)
#delta = [(o[-1]-t) * o[-1] * (1-o[-1])]
delta = [o[-1]-t]
for i, weight in enumerate(reversed(self.weights)):
delta.append(weight.T.dot(delta[-1]) * o[-i-2] * (1-o[-i-2]))
delta.reverse() #surely i need this
return o, delta
def train(self, inputs, outputs, epochs=1000, rate=.1):
errors = []
for epoch in range(epochs):
for x, t in zip(inputs, outputs): #shuffle? subset?
o, d = self.delta(x, t)
for layer in range(self.hidden_layers+1):
grad = numpy.outer(d[layer+1], o[layer])
self.weights[layer] -= rate * grad
return errors
def rmse(self, inputs, outputs):
return ((outputs - nmap(self, inputs))**2).sum()**.5/len(inputs)
n = Net(1, 8, 1)
X = numpy.linspace(0, 2*3.1415, 10)
T = numpy.sin(X)
Y = map(n, X)
Y = numpy.array([y[0,0] for y in Y])
matplotlib.pyplot.plot(X, T, 'g')
matplotlib.pyplot.plot(X, Y, 'r')
print 'output successful'
print n.rmse(X, T)
errors = n.train(X, T)
print 'tried to train successfully'
print n.rmse(X, T)
Y = map(n, X)
Y = numpy.array([y[0,0] for y in Y])
matplotlib.pyplot.plot(x, Y, 'b')
matplotlib.pyplot.show()

Categories