I am supposed to calculate the Taylor series approximation of the function cos(x) + 1 using a TensorFlow custom gradient.
I wrote the following code:
def approx_cos_p1(x, n=7, dtype=tf.float32):
    """Return the approximation of cos(x) + 1 using a Taylor series expansion up to order n."""
    result = tf.constant(2, dtype)
    for i in range(1, n // 2 + 1):
        if i % 2 == 1:
            num = tf.math.pow(x, i * 2)
            den = math.factorial(i * 2)
            result = tf.math.subtract(result, tf.math.divide(num, den))
        else:
            num = tf.math.pow(x, i * 2)
            den = math.factorial(i * 2)
            result = tf.math.add(result, tf.math.divide(num, den))
    return result

@tf.custom_gradient
def approx_cos_p1_custom_grad(x):
    def backward(dy):
        return approx_cos_p1(x)
    return x, backward

x = tf.Variable(3.0, dtype=tf.float32)
with tf.GradientTape(persistent=True) as t:
    output = approx_cos_p1_custom_grad(x)
print(t.gradient(output, x))
But according to the TensorFlow documentation, tf.custom_gradient should be used as follows:
# Establish an identity operation, but clip during the gradient pass
@tf.custom_gradient
def clip_gradients(y):
    def backward(dy):
        return tf.clip_by_norm(dy, 0.5)
    return y, backward

v = tf.Variable(2.0)
with tf.GradientTape() as t:
    output = clip_gradients(v * v)
print(t.gradient(output, v))  # calls "backward", which clips 4 to 2
The approx_cos_p1() function works perfectly.
The problem is that the dy parameter of backward() is never passed to approx_cos_p1(), which does not match the TensorFlow documentation, yet I get the desired output of -3.15.
When I do pass dy to approx_cos_p1(), I get an undesired output of 1.14.
Is my implementation of the function correct?
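For reference, here is a minimal sketch of how the decorator is usually wired: the forward pass returns the function's value, and backward returns dy multiplied by the derivative. The derivative used below (a truncated Taylor series for -sin(x)) is my own illustration, not part of the question, and the sketch reuses approx_cos_p1 from above:

import math
import tensorflow as tf

@tf.custom_gradient
def approx_cos_p1_with_grad(x):
    # Forward pass: the Taylor approximation of cos(x) + 1 itself.
    y = approx_cos_p1(x)

    def backward(dy):
        # Chain rule: return dy * d/dx[cos(x) + 1] = dy * (-sin(x)).
        # Here -sin(x) is approximated by its own truncated Taylor series.
        neg_sin = tf.constant(0.0, x.dtype)
        for i in range(1, 4):
            term = tf.math.pow(x, 2 * i - 1) / math.factorial(2 * i - 1)
            neg_sin += term if i % 2 == 0 else -term
        return dy * neg_sin

    return y, backward

x = tf.Variable(3.0, dtype=tf.float32)
with tf.GradientTape() as t:
    # Passing a tensor (as in the docs example) rather than the Variable itself.
    output = approx_cos_p1_with_grad(tf.identity(x))
print(t.gradient(output, x))  # three series terms give about -0.53; more terms approach -sin(3.0) = -0.14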
Related
I've seen multiple SO answers such as SO1, SO2, SO3, SO4, SO5 but can't seem to get the answer I want.
I have a cost function that depends on a parameter (c_vir); to be exact, I minimize over ln(c_vir) so that the minimizer doesn't wander off into unexpected regions.
I get correct answers when the minimum is much deeper than the rest of the cost function, but the minimizers don't seem to converge when the function is fairly flat.
I use scipy local and global minimizers and also iminuit for minimizing my cost function.
Here is a graph that plots the cost function (computed with a brute-force for loop) and marks the minimum of the cost curve. It also shows where the two minimizers land relative to the actual minimum.
Here is one of the cost functions using the scipy basinhopping method and the iminuit simplex method.
Here is the same cost function, now using the scipy SHGO method and the iminuit simplex method.
This is the syntax for my minimizers:
optres = iminuit.minimize(cost, [np.log(5)],
                          args=(den, eps, self.Mvir, self.Rvir, mask, cost_func),
                          method='simplex',
                          bounds=(np.log(1e-5), np.log(50)),
                          tol=1e-2,
                          options={'stra': 2, 'maxfun': 500})

optres = so.basinhopping(cost, np.log(5), stepsize=1,
                         minimizer_kwargs={"method": "Nelder-Mead",
                                           "args": (den, eps, self.Mvir, self.Rvir, mask, cost_func)})

optres = so.shgo(cost, bounds=[(np.log(1e-2), np.log(50))],
                 args=(den, eps, self.Mvir, self.Rvir, mask, cost_func),
                 sampling_method='sobol',
                 minimizer_kwargs={'method': 'Nelder-Mead'})
Changing the initial guess from np.log(5) to np.log(10) yields the same results, and the bounds only seem to constrain the more extreme cost functions, not these almost-flat ones.
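One thing worth checking for flat regions (my own suggestion, not from the original post) is whether the local Nelder-Mead step simply declares convergence too early: its default xatol/fatol tolerances are fairly loose, and they can be tightened through minimizer_kwargs. A minimal self-contained sketch with a deliberately flat toy cost:

import numpy as np
import scipy.optimize as so

def flat_cost(lncvir):
    # Toy stand-in for a nearly flat cost curve with a shallow minimum at ln(8).
    c = np.atleast_1d(lncvir)[0]
    return 1e-3 * (c - np.log(8.0)) ** 2 + 5.0

optres = so.basinhopping(
    flat_cost, np.log(5), stepsize=1,
    minimizer_kwargs={
        "method": "Nelder-Mead",
        # Tighter tolerances than the defaults so the local search does not
        # stop while the simplex is still far from the shallow minimum.
        "options": {"xatol": 1e-8, "fatol": 1e-10, "maxiter": 2000},
    },
)
print(optres.x, np.exp(optres.x))  # should be close to ln(8) and 8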
Underlying Cost Function
@jit
def cost(lncvir, obs, epsilon, M, Rvir, mask, func="gaussian"):  # theta is Rs, M, Rvir
    Rs = Rvir / np.exp(lncvir)
    # if lncvir < 0:
    #     return np.inf
    # Rs = Rvir / lncvir
    _, model = rho_r(Rs, M, Rvir, mask)
    Cost = chisq(obs, model, epsilon, func)
    return Cost

@njit(fastmath=True)
def chisq(obs: np.ndarray, model: np.ndarray, epsilon: float, func: str = "gaussian"):
    residual = obs - model
    # residual ** 2 * cinv for every bin
    if func == "gaussian":
        return np.sum(np.square(residual) / np.square(epsilon * obs))
    elif func == "lorentz":
        temp = np.square(residual) / np.square(epsilon * obs)
        return np.sum(np.log(1 + temp))
    elif func == 'abs':
        return np.sum(np.abs(residual) / (epsilon * obs))

@njit(fastmath=True)
def rho_o(M: float, Rvir: float, Rs: float):
    c = Rvir / Rs
    ln_term = np.log(1.0 + c) - (c / (1.0 + c))
    rho_not = M / (4.0 * np.pi * (Rs**3.0) * ln_term)
    return rho_not

@njit()
def rho_r(Rs: float, M: float, Rvir: float, mask: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    r = RADIUS[mask]
    term = r / Rs
    rho_not = rho_o(M, Rvir, Rs)
    return r, rho_not / (term * ((1.0 + term)**2.0))
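For readers unfamiliar with the model: rho_o and rho_r appear to implement the standard NFW density profile, rho(r) = rho_0 / [(r/R_s) (1 + r/R_s)^2], with rho_0 = M / (4 pi R_s^3 (ln(1 + c) - c/(1 + c))) and c = R_vir / R_s, which is exactly what the two functions above compute.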
I'm trying to understand the behaviour of argnums in JAX's gradient function.
Suppose I have the following function:
def make_mse(x, t):
    def mse(w, b):
        # jnp.sum is used here (rather than np.sum) so the function can be traced by grad
        return jnp.sum(jnp.power(x.dot(w) + b - t, 2)) / 2
    return mse
And I'm taking the gradient in the following way:
w_gradient, b_gradient = grad(make_mse(train_data, y), (0,1))(w,b)
argnums=(0, 1) in this case, but what does it mean? With respect to which variables is the gradient calculated? What would be the difference if I used argnums=0 instead?
Also, can I use the same function to get the Hessian matrix?
I looked at the JAX help section about it, but couldn't figure it out.
When you pass multiple argnums to grad, the result is a function that returns a tuple of gradients, equivalent to computing each one separately:
def f(x, y):
    return x ** 2 + x * y + y ** 2

df_dxy = grad(f, argnums=(0, 1))
df_dx = grad(f, argnums=0)
df_dy = grad(f, argnums=1)

x = 3.0
y = 4.25

assert df_dxy(x, y) == (df_dx(x, y), df_dy(x, y))
If you want to compute mixed second derivatives, you can do so by repeatedly applying the gradient:
d2f_dxdy = grad(grad(f, argnums=0), argnums=1)
assert d2f_dxdy(x, y) == 1
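To the Hessian part of the question: jax.hessian (which composes jacfwd and jacrev under the hood) gives the full matrix directly. A minimal sketch using the same f as above:

import jax

# hessian accepts the same argnums; with argnums=(0, 1) it returns a nested
# tuple of all second derivatives of f with respect to x and y.
H = jax.hessian(f, argnums=(0, 1))(3.0, 4.25)
print(H)  # for this quadratic: d2f/dx2 = 2, d2f/dxdy = d2f/dydx = 1, d2f/dy2 = 2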
I have implemented the following Jacobian function in PyTorch. Unless I have made a mistake, it computes the Jacobian of any tensor w.r.t. inputs of any dimensionality:
import torch
import torch.autograd as ag

def nd_range(stop, dims=None):
    if dims is None:
        dims = len(stop)
    if not dims:
        yield ()
        return
    for outer in nd_range(stop, dims - 1):
        for inner in range(stop[dims - 1]):
            yield outer + (inner,)

def full_jacobian(f, wrt):
    f_shape = list(f.size())
    wrt_shape = list(wrt.size())
    fs = []
    f_range = nd_range(f_shape)
    wrt_range = nd_range(wrt_shape)
    for f_ind in f_range:
        grad = ag.grad(f[tuple(f_ind)], wrt, retain_graph=True, create_graph=True)[0]
        for i in range(len(f_shape)):
            grad = grad.unsqueeze(0)
        fs.append(grad)
    fj = torch.cat(fs, dim=0)
    fj = fj.view(f_shape + wrt_shape)
    return fj
On top of this, I have tried to implement a recursive function to calculate nth order derivatives:
def nth_derivative(f, wrt, n):
    if n == 1:
        return full_jacobian(f, wrt)
    else:
        deriv = nth_derivative(f, wrt, n - 1)
        return full_jacobian(deriv, wrt)
I ran a simple test:
op = torch.ger(s, s)
deep_deriv = nth_derivative(op, s, 5)
Unfortunately, this succeeds in getting me the Hessian... but no higher-order derivatives. I'm aware that many higher-order derivatives should be 0, but I'd prefer that PyTorch compute that analytically.
One fix has been to change the gradient calculation to:
try:
    grad = ag.grad(f[tuple(f_ind)], wrt, retain_graph=True, create_graph=True)[0]
except:
    grad = torch.zeros_like(wrt)
Is this the accepted correct way to handle this? Or is there a better option? Or do I have the reason for my issue completely wrong to begin with?
You can just call the grad function iteratively:
import torch
from torch.autograd import grad

def nth_derivative(f, wrt, n):
    for i in range(n):
        grads = grad(f, wrt, create_graph=True)[0]
        f = grads.sum()
    return grads

# dtype=torch.float32 is needed so requires_grad is allowed on current PyTorch versions
x = torch.arange(4, dtype=torch.float32, requires_grad=True).reshape(2, 2)
loss = (x ** 4).sum()
print(nth_derivative(f=loss, wrt=x, n=3))
outputs
tensor([[ 0., 24.],
        [48., 72.]])
For the second order derivative, you can use PyTorch's hessian function:
torch.autograd.functional.hessian()
For higher order derivatives, you can repeatedly call jacobian or grad while maintaining the computational graph:
create_graph (bool, optional) – If True, graph of the derivative will be constructed, allowing to compute higher order derivative products.
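A minimal sketch of both routes (the function and tensor names are mine, chosen to mirror the earlier example):

import torch
from torch.autograd.functional import hessian, jacobian

def f(x):
    return (x ** 4).sum()

x = torch.arange(4, dtype=torch.float32).reshape(2, 2)

# Second derivative directly.
H = hessian(f, x)  # shape (2, 2, 2, 2)

# Higher orders by differentiating the Jacobian again; create_graph=True in the
# inner calls keeps the graph alive so the result can itself be differentiated.
def grad_f(x):
    return jacobian(f, x, create_graph=True)       # first derivative, 4 * x**3

def hess_f(x):
    return jacobian(grad_f, x, create_graph=True)  # second derivative

third = jacobian(hess_f, x)  # third derivative, shape (2, 2, 2, 2, 2, 2), 24 * x on its "diagonal"
print(third.shape)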
Given a set of points, we'd like to find the straight line that best fits the data. I have implemented a version where the minimization of the cost function is done via gradient descent, and now I'd like to use the algorithm from scipy (scipy.optimize.minimize).
I tried this:
def gradDescVect(X, y, theta, alpha, iters):
    m = shape(X)[0]
    grad = copy(theta)
    for c in range(0, iters):
        error_sum = hypo(X, grad) - y
        error_sum = X.T.dot(error_sum)
        grad -= (alpha / m) * error_sum
    return grad

def computeCostScipy(theta, X, y):
    """Compute cost, vectorized version"""
    m = len(y)
    term = hypo(X, theta) - y
    print((term.T.dot(term) / (2 * m))[0, 0])
    return (term.T.dot(term) / (2 * m))[0, 0]

def findMinTheta(theta, X, y):
    result = scipy.optimize.minimize(computeCostScipy, theta, args=(X, y), method='BFGS',
                                     options={"maxiter": 5000, "disp": True})
    return result.x, result.fun
This actually works fairly well and gives results very close to those of the original gradient-descent version.
The only problem seems to be that minimize() (the fminunc() equivalent) stops execution before reaching the minimum value of the cost function.
I'm sure that computeCostScipy() works fine, and the number of iterations allowed for minimize() is set higher than the number it actually takes.
Output:
Optimization terminated successfully.
Current function value: 15.024985
Iterations: 2
Function evaluations: 16
Gradient evaluations: 4
Cost : 15.0249848024
Theta : [ 0.15232531 0.93072285]
while the correct results are:
Cost : 4.48338825659
Theta : [[-3.63029144]
[ 1.16636235]]
See how close the results are:
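For comparison, here is a minimal self-contained sketch (toy data and names of my own, not the original hypo/X/y) of the same least-squares fit, where theta is kept as a flat 1-D array and BFGS is given the analytic gradient via jac=, which usually lets it converge to the true minimum:

import numpy as np
from scipy.optimize import minimize

# Toy data: y = -3.6 + 1.17 * x plus noise (made-up numbers for illustration).
rng = np.random.default_rng(0)
x1 = rng.uniform(0, 10, 50)
X = np.column_stack([np.ones_like(x1), x1])   # add an intercept column
y = -3.6 + 1.17 * x1 + rng.normal(0, 0.5, 50)

def cost(theta, X, y):
    r = X.dot(theta) - y
    return r.dot(r) / (2 * len(y))

def cost_grad(theta, X, y):
    # Analytic gradient of the cost: X^T (X theta - y) / m
    return X.T.dot(X.dot(theta) - y) / len(y)

res = minimize(cost, np.zeros(2), args=(X, y), method='BFGS', jac=cost_grad)
print(res.x)  # should be close to [-3.6, 1.17]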
The darn thing just won't learn. Sometimes the weights seem to become NaN.
I haven't played with different numbers of hidden layers/inputs/outputs, but the bug appears consistently across different hidden-layer sizes.
from __future__ import division
import numpy
import matplotlib
import matplotlib.pyplot
import random

class Net:
    def __init__(self, *sizes):
        sizes = list(sizes)
        sizes[0] += 1
        self.sizes = sizes
        self.weights = [numpy.random.uniform(-1, 1, (sizes[i+1], sizes[i])) for i in range(len(sizes)-1)]

    @staticmethod
    def activate(x):
        return 1/(1+numpy.exp(-x))

    def y(self, x_):
        x = numpy.concatenate(([1], numpy.atleast_1d(x_.copy())))
        o = [x]  # o[i] is the (activated) output of hidden layer i, "hidden layer 0" is inputs
        for weight in self.weights[:-1]:
            x = weight.dot(x)
            x = Net.activate(x)
            o.append(x)
        o.append(self.weights[-1].dot(x))
        return o

    def __call__(self, x):
        return self.y(x)[-1]

    def delta(self, x, t):
        o = self.y(x)
        delta = [(o[-1]-t) * o[-1] * (1-o[-1])]
        for i, weight in enumerate(reversed(self.weights)):
            delta.append(weight.T.dot(delta[-1]) * o[-i-2] * (1-o[-i-2]))
        delta.reverse()
        return o, delta

    def train(self, inputs, outputs, epochs=100, rate=.1):
        for epoch in range(epochs):
            pairs = zip(inputs, outputs)
            random.shuffle(pairs)
            for x, t in pairs:  # shuffle? subset?
                o, d = self.delta(x, t)
                for layer in range(len(self.sizes)-1):
                    self.weights[layer] -= rate * numpy.outer(o[layer+1], d[layer])

n = Net(1, 4, 1)
x = numpy.linspace(0, 2*3.14, 10)
t = numpy.sin(x)
matplotlib.pyplot.plot(x, t, 'g')
matplotlib.pyplot.plot(x, map(n, x), 'r')
n.train(x, t)
print n.weights
matplotlib.pyplot.plot(x, map(n, x), 'b')
matplotlib.pyplot.show()
I haven't looked for a particular bug in your code, but could you please try the following things to narrow down your problem further? Otherwise it is very tedious to find the needle in the haystack.
1) Please try to use a real dataset to get an idea of what to expect, e.g., MNIST, and/or standardize your data, because your weights may become NaN if they become too small.
2) Try different learning rates and plot the cost function vs. epochs to check whether you are converging. It should look somewhat like this (note that I used minibatch learning and averaged the minibatch chunks for each epoch).
3) I see that you are using a sigmoid activation; your implementation is correct, but to make it numerically more stable, replace 1.0 / (1.0 + np.exp(-z)) with expit(z) from scipy.special (the same function, but more efficient).
4) Implement gradient checking. Here, you compare the analytical gradient to a numerically approximated one, e.g., the forward difference quotient (J(θ + ε) - J(θ)) / ε. An even better approach, which yields a more accurate approximation of the gradient, is the symmetric (or centered) difference quotient given by the two-point formula (J(θ + ε) - J(θ - ε)) / (2ε); a minimal sketch of such a check follows this list.
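Here is that sketch (the helper name numeric_grad_check is my own, not from the answer); it applies the centered-difference formula to a toy cost whose exact gradient is known:

import numpy as np

def numeric_grad_check(cost, theta, eps=1e-5):
    """Centered-difference approximation of d cost / d theta, to compare
    against backprop gradients (hypothetical helper for illustration)."""
    grad = np.zeros_like(theta)
    it = np.nditer(theta, flags=['multi_index'])
    for _ in it:
        idx = it.multi_index
        old = theta[idx]
        theta[idx] = old + eps
        plus = cost(theta)
        theta[idx] = old - eps
        minus = cost(theta)
        theta[idx] = old
        grad[idx] = (plus - minus) / (2 * eps)
    return grad

# Example: cost(theta) = sum(theta ** 2), whose exact gradient is 2 * theta.
theta = np.array([[1.0, -2.0], [0.5, 3.0]])
approx = numeric_grad_check(lambda t: np.sum(t ** 2), theta)
print(np.max(np.abs(approx - 2 * theta)))  # should be tiny, on the order of 1e-10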
PS: If you are interested and find it useful, I have a working vanilla NumPy neural net implemented here.
I fixed it! Thanks for all the suggestions. I worked out numeric partials and found that my o and deltas were correct, but I was multiplying the wrong ones. That's why I now take numpy.outer(d[layer+1], o[layer]) instead of numpy.outer(d[layer], o[layer+1]).
I was also skipping the update on one layer. That's why I changed for layer in range(self.hidden_layers) to for layer in range(self.hidden_layers+1).
I'll add that I caught a bug just before posting originally. My output layer delta was incorrect because my net (intentionally) doesn't activate the final outputs, but my delta was computed as though it did.
I debugged primarily with a one-hidden-layer, one-hidden-unit net, then moved to a model with 2 inputs, 3 hidden layers of 2 neurons each, and 2 outputs.
from __future__ import division
import numpy
import scipy
import scipy.special
import matplotlib
import matplotlib.pyplot
#from pylab import *
#numpy.random.seed(23)

def nmap(f, x):
    return numpy.array(map(f, x))

class Net:
    def __init__(self, *sizes):
        self.hidden_layers = len(sizes)-2
        self.weights = [numpy.random.uniform(-1, 1, (sizes[i+1], sizes[i])) for i in range(self.hidden_layers+1)]

    @staticmethod
    def activate(x):
        return scipy.special.expit(x)
        #return 1/(1+numpy.exp(-x))

    @staticmethod
    def activate_(x):
        s = scipy.special.expit(x)
        return s*(1-s)

    def y(self, x):
        o = [numpy.array(x)]  # o[i] is the (activated) output of hidden layer i, "hidden layer 0" is inputs and not activated
        for weight in self.weights[:-1]:
            o.append(Net.activate(weight.dot(o[-1])))
        o.append(self.weights[-1].dot(o[-1]))
        # for weight in self.weights:
        #     o.append(Net.activate(weight.dot(o[-1])))
        return o

    def __call__(self, x):
        return self.y(x)[-1]

    def delta(self, x, t):
        x = numpy.array(x)
        t = numpy.array(t)
        o = self.y(x)
        #delta = [(o[-1]-t) * o[-1] * (1-o[-1])]
        delta = [o[-1]-t]
        for i, weight in enumerate(reversed(self.weights)):
            delta.append(weight.T.dot(delta[-1]) * o[-i-2] * (1-o[-i-2]))
        delta.reverse()  # surely i need this
        return o, delta

    def train(self, inputs, outputs, epochs=1000, rate=.1):
        errors = []
        for epoch in range(epochs):
            for x, t in zip(inputs, outputs):  # shuffle? subset?
                o, d = self.delta(x, t)
                for layer in range(self.hidden_layers+1):
                    grad = numpy.outer(d[layer+1], o[layer])
                    self.weights[layer] -= rate * grad
        return errors

    def rmse(self, inputs, outputs):
        return ((outputs - nmap(self, inputs))**2).sum()**.5/len(inputs)

n = Net(1, 8, 1)
X = numpy.linspace(0, 2*3.1415, 10)
T = numpy.sin(X)
Y = map(n, X)
Y = numpy.array([y[0,0] for y in Y])
matplotlib.pyplot.plot(X, T, 'g')
matplotlib.pyplot.plot(X, Y, 'r')
print 'output successful'
print n.rmse(X, T)
errors = n.train(X, T)
print 'tried to train successfully'
print n.rmse(X, T)
Y = map(n, X)
Y = numpy.array([y[0,0] for y in Y])
matplotlib.pyplot.plot(X, Y, 'b')
matplotlib.pyplot.show()