Hi, I am trying to build a simple autoencoder model in Theano, but I also want to optimize it with SciPy's scipy.optimize.minimize. I found code here: http://dlacombejr.github.io/programming/2015/09/13/sparse-filtering-implemenation-in-theano.html, which I modified a bit. The problem is that scipy.optimize.minimize takes a single parameter array, but I need to optimize more than one weight parameter. What is the solution to this problem if I want to keep the same structure as the code in the link?
I found a possible way around this: pass all the weights in as a single flattened and concatenated array, and then assign slices of that array back to the individual variables, following https://github.com/Theano/Theano/issues/3131
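To make the idea concrete outside of Theano, this is the kind of pack/unpack I mean (the names w, b, hdim, input_dim below are only placeholders):

import numpy as np

input_dim, hdim = 4, 3  # placeholder sizes

# pack: flatten each parameter and concatenate everything into one vector
w = np.random.randn(hdim, input_dim)
b = np.random.randn(hdim)
theta = np.concatenate([w.ravel(), b.ravel()])

# unpack: slice the flat vector and reshape back to the original shapes
w2 = theta[:hdim * input_dim].reshape(hdim, input_dim)
b2 = theta[hdim * input_dim:]

assert np.allclose(w, w2) and np.allclose(b, b2)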
import theano
from theano import tensor as t
import numpy as np

class SparseFilter(object):

    def __init__(self, theta, dims, x):
        # assign inputs to sparse filter
        self.theta = theta
        self.x = x
        self.dims = dims
        self.w = theano.shared(np.random.randn(self.dims[1], self.dims[0]), name='w')
        self.w = t.reshape(self.theta[0:(self.dims[0]*self.dims[1])], self.dims)
        self.b = theano.shared(np.random.randn(self.dims[1]), name='b')
        self.b = t.reshape(self.theta[(self.dims[0]*self.dims[1]):self.theta.get_value().shape[0]], (self.dims[0], 1))

    # the feed-forward function is not fully written
    def feed_forward(self):
        f = t.dot(self.w, self.x.T) + self.b
        return f

    def get_cost_grads(self):
        cost = t.sum(t.abs_(self.feed_forward()))
        gradw = t.grad(cost=cost, wrt=self.w).flatten()
        gradb = t.grad(cost=cost, wrt=self.b).flatten()
        return cost, gradw, gradb
def training_functions(data, model, dims):
    cost, gradw, gradb = model.get_cost_grads()
    fn = theano.function(inputs=[], outputs=[cost, gradw, gradb],
                         givens={model.x: data}, allow_input_downcast=True)

    def train_fn(theta_value):
        # reshape the theta value for Theano and convert to float32
        model.w = t.reshape(theta_value[0:(dims[0]*dims[1])], dims)
        model.b = t.reshape(theta_value[(dims[0]*dims[1]):theta_value.shape[0]], (dims[0], 1))
        c, gw, gb = fn()
        # convert values to float64 for SciPy
        c = np.asarray(c, dtype=np.float64)
        gw = np.asarray(gw, dtype=np.float64)
        gb = np.asarray(gb, dtype=np.float64)
        grad = np.append(gw, gb)
        return c, grad

    return train_fn
theta = theano.shared(np.random.randn((input_dim*hdim) + hdim), name='theta')
np.random.seed(0)
theta.set_value(np.random.randn((input_dim*hdim) + hdim).astype('float32'))
model = SparseFilter(theta, dims, x)
train_fn = training_functions(data, model, dims)
from scipy.optimize import minimize
weights = minimize(train_fn, model.theta.eval(),
                   method='L-BFGS-B', jac=True,
                   options={'maxiter': 100, 'disp': True})
The optimizer performs no iterations and the operation just stops.
There is a message in the output:
message: 'ABNORMAL_TERMINATION_IN_LNSRCH'
Any suggestions? Thank you.
I need to create an MLP function. I have a picture of the network, and the function has to work as shown in that image. I don't understand how to code the relation between the weights and the neurons. Are these random weights correct? I don't have much code yet:
import numpy as np

def ui_sigmoid(x, beta):
    return 1 / (1 + np.exp(-beta*x))

def bi_sigmoid(x, beta):
    return np.tanh(beta*x)

def func(inputs):
    w1 = np.random.random((2, 3))
    w2 = np.random.random((1, 3))
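My rough guess of how the forward pass should connect the layers, based on the picture (the sizes here, 2 inputs, 3 hidden neurons and 1 output, and the name mlp_forward are just my guesses):

import numpy as np

def ui_sigmoid(x, beta=1.0):
    return 1 / (1 + np.exp(-beta * x))

def mlp_forward(inputs, w1, w2, beta=1.0):
    # each row of w1 holds the weights of one hidden neuron (3 hidden neurons, 2 inputs)
    hidden = ui_sigmoid(w1.dot(inputs), beta)   # shape (3,)
    # each row of w2 holds the weights of one output neuron (1 output, 3 hidden)
    output = ui_sigmoid(w2.dot(hidden), beta)   # shape (1,)
    return output

w1 = np.random.random((3, 2))
w2 = np.random.random((1, 3))
print(mlp_forward(np.array([0.4, 0.7]), w1, w2))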
I have been trying to implement my own linear regression from scratch in Python, but I have been facing an issue for the last few days.
This is the code I am using:
Import modules
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
Initialize parameters
def initialize_parameters(n):
    w = np.zeros(n,)
    b = 0.0
    return w, b
Predictor/Hypothesis
def predictor(x, w, b):
    return np.dot(x, w) + b
Cost function
def calculate_cost(X, y, theta, b):
    m = len(y)
    predictions = np.dot(X, theta)
    error = predictions - y
    cost = (1/2*m) * np.sum(np.power(error, 2))
    return cost
Gradient descent
def gradient_descent(X, W, b, y, learning_rate = 0.0001, epochs = 25):
    m = len(y)
    final_cost = 0
    for _ in range(epochs):
        predictions = predictor(X, W, b)
        error = predictions - y
        derivate = np.dot(error, X)
        print(derivate)
        W = W - (learning_rate/m) * derivate
        b = b - (learning_rate/m) * error.sum()
Test run:
# Load dataset
boston = load_boston()
data = pd.DataFrame(boston.data)
data.columns = boston.feature_names
data['PRICE'] = boston.target
# Split dataset
X = data.drop(columns=['PRICE']).values
Y = data['PRICE'].values
w, b = initialize_parameters(X.shape[1])
gradient_descent(X, w, b, Y)
During the test run, I can see that the values of the derivative are growing insanely fast:
[1.41239553e+06 3.20162679e+06 3.84829686e+06 2.17737688e+04
1.81667467e+05 1.99565485e+06 2.27660208e+07 1.15045731e+06
3.50107975e+06 1.40396525e+08 5.96494458e+06 1.14447329e+08
4.25947931e+06]
[-4.33362969e+07 -9.66008831e+07 -1.16941872e+08 -6.62733008e+05
-5.50761913e+06 -6.04452389e+07 -6.90425672e+08 -3.46792848e+07
-1.06967561e+08 -4.26847914e+09 -1.80579130e+08 -3.45024565e+09
-1.29016170e+08]
...
[-2.01209195e+34 -4.47742185e+34 -5.42629282e+34 -3.07294644e+32
-2.55503032e+33 -2.80363423e+34 -3.20314565e+35 -1.60824109e+34
-4.96433806e+34 -1.98052568e+36 -8.37673498e+34 -1.60024763e+36
-5.98654489e+34]
[6.09700758e+35 1.35674093e+36 1.64426623e+36 9.31159124e+33
7.74221040e+34 8.49552585e+35 9.70611871e+36 4.87326542e+35
1.50428547e+36 6.00135600e+37 2.53830431e+36 4.84904376e+37
1.81403288e+36]
[-1.84750510e+37 -4.11117381e+37 -4.98242821e+37 -2.82158290e+35
-2.34603173e+36 -2.57430013e+37 -2.94113196e+38 -1.47668879e+37
-4.55826082e+37 -1.81852092e+39 -7.69152754e+37 -1.46934918e+39
-5.49685229e+37]
[5.59827926e+38 1.24576106e+39 1.50976712e+39 8.54991361e+36
7.10890636e+37 7.80060146e+38 8.91216919e+39 4.47463782e+38
1.38123662e+39 5.51045187e+40 2.33067389e+39 4.45239747e+40
1.66564705e+39]
[-1.69638128e+40 -3.77488445e+40 -4.57487122e+40 -2.59078061e+38
-2.15412899e+39 -2.36372529e+40 -2.70055070e+41 -1.35589732e+40
-4.18540025e+40 -1.66976797e+42 -7.06236930e+40 -1.34915808e+42
-5.04721600e+40]
And then the gradient descent run stops before completing all iterations because of the huge values.
At a certain point, the derivative values become NaN.
As expected, when I try to predict a test case, I get 0.0 as output:
sample_house = [[2.29690000e-01, 0.00000000e+00, 1.05900000e+01, 0.00000000e+00, 4.89000000e-01,
6.32600000e+00, 5.25000000e+01, 4.35490000e+00, 4.00000000e+00, 2.77000000e+02,
1.86000000e+01, 3.94870000e+02, 1.09700000e+01]]
test_predict = predictor(sample_house, w, b)
test_predict
------------------------------------------------
out : array([0.])
Thanks!
Your cost function is wrong, it should be:
cost = 1/(2*m) * np.sum(np.power(error,2))
Also, try to initialize your weights as random values between 0 and 1 and scale your inputs to the range 0-1.
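For example (plain NumPy, reusing the X, w, b names from your code; min-max scaling is just one way to do it):

# scale every feature column of X into the range 0-1
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

# initialize the weights with random values between 0 and 1 instead of zeros
w = np.random.random(X.shape[1])
b = 0.0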
I had the same issue which I resolved by normalizing the x values.
I think you are making a mistake in the gradient descent algorithm. When updating the values of the "W" vector, it should be:
W = W - (learning_rate/m) * derivate.sum()
The learning rate is too large.
I tried learning_rate = 0.000001, and it converges normally.
I am trying to define a custom Theano Op with a gradient in order to use it with pymc3, but I don't understand how to define the grad method.
The code below is where I'm stuck. The function phi() is a mock function (in practice, it is an external program); for a scalar input x it returns a vector (phi_0(x), phi_1(x), ...). The function phi_diff() (also a mock function) returns the vector (dphi_0/dx, dphi_1/dx, ...).
I wrapped phi() and phi_diff() in a theano.Op object, but my implementation of the grad function does not work. The Theano documentation contains simpler examples, and I don't understand how to adapt them to this case. Any help would be greatly appreciated.
import numpy as np
import theano.tensor as T
import theano

theano.config.optimizer = "None"
theano.config.exception_verbosity = "high"

def phi(x):
    return np.arange(n) * x

def phi_diff(x):
    return np.arange(n)

class PhiOp(theano.Op):
    itypes = [theano.tensor.dscalar]
    otypes = [theano.tensor.dvector]

    def perform(self, node, inputs, output_storage):
        x = inputs[0]
        output_storage[0][0] = phi(x)

    def grad(self, inputs, output_grads):
        x = inputs[0]
        # ???
        return [PhiDiffOp()(x) * output_grads[0]]

class PhiDiffOp(theano.Op):
    itypes = [theano.tensor.dscalar]
    otypes = [theano.tensor.dvector]

    def perform(self, node, inputs, output_storage):
        x = inputs[0]
        output_storage[0][0] = phi_diff(x)

n = 5
x = 777.

phi_op = PhiOp()
x_tensor = T.dscalar("x_tensor")
phi_func = theano.function([x_tensor], phi_op(x_tensor))
np.testing.assert_allclose(phi_func(x), phi(x))

T.jacobian(phi_op(x_tensor), x_tensor)
Found the solution, changes below:
def phi_diff(x):
    return np.arange(n, dtype=np.float_)

class PhiOp(theano.Op):
    def grad(self, inputs, output_grads):
        x = inputs[0]
        gg = (PhiDiffOp()(x) * output_grads[0]).sum()
        return [gg]
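To check that the gradient behaves as expected with the .sum() in place (the summed cost below is just my own sanity check, not part of the original wrapper), something like this can be used:

# gradient of a scalar cost built from the Op; should equal phi_diff(x).sum()
cost = phi_op(x_tensor).sum()
g = T.grad(cost, x_tensor)
g_func = theano.function([x_tensor], g)
np.testing.assert_allclose(g_func(x), phi_diff(x).sum())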
How can it be that this works:
g_W = T.grad(cost=cost, wrt=classifier.vparamW)
whereas this
H_W = T.hessian(cost=cost, wrt=classifier.vparamW)
gives NotImplementedError()?
Could the problem be in this cost function:
-T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
Here y is the vector of class labels from 0 to n-1, and
self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
I am unable to reproduce this problem with the limited code that has been provided. However, here is a fully working demo of T.grad and T.hessian.
import numpy
import theano
import theano.tensor as T
x = T.matrix()
w_flat = theano.shared(numpy.random.randn(3, 2).astype(theano.config.floatX).flatten())
w = w_flat.reshape((3, 2))
cost = T.pow(theano.dot(x, w), 2).sum()
g_w = T.grad(cost=cost, wrt=[w])
h_w = T.hessian(cost=cost, wrt=[w_flat])
f = theano.function([x], outputs=g_w + h_w)
for output in f(numpy.random.randn(4, 3).astype(theano.config.floatX)):
    print output.shape, '\n', output
Note that the wrt value for T.hessian needs to be a vector.
The darn thing just won't learn; sometimes the weights seem to become NaN.
I haven't played with different numbers of hidden layers/inputs/outputs, but the bug appears consistent across different hidden layer sizes.
from __future__ import division
import numpy
import matplotlib.pyplot
import random

class Net:
    def __init__(self, *sizes):
        sizes = list(sizes)
        sizes[0] += 1  # extra input slot for the bias term
        self.sizes = sizes
        self.weights = [numpy.random.uniform(-1, 1, (sizes[i+1], sizes[i])) for i in range(len(sizes)-1)]

    @staticmethod
    def activate(x):
        return 1/(1+numpy.exp(-x))

    def y(self, x_):
        x = numpy.concatenate(([1], numpy.atleast_1d(x_.copy())))
        o = [x]  # o[i] is the (activated) output of hidden layer i, "hidden layer 0" is inputs
        for weight in self.weights[:-1]:
            x = weight.dot(x)
            x = Net.activate(x)
            o.append(x)
        o.append(self.weights[-1].dot(x))
        return o

    def __call__(self, x):
        return self.y(x)[-1]

    def delta(self, x, t):
        o = self.y(x)
        delta = [(o[-1]-t) * o[-1] * (1-o[-1])]
        for i, weight in enumerate(reversed(self.weights)):
            delta.append(weight.T.dot(delta[-1]) * o[-i-2] * (1-o[-i-2]))
        delta.reverse()
        return o, delta

    def train(self, inputs, outputs, epochs=100, rate=.1):
        for epoch in range(epochs):
            pairs = zip(inputs, outputs)
            random.shuffle(pairs)
            for x, t in pairs:  # shuffle? subset?
                o, d = self.delta(x, t)
                for layer in range(len(self.sizes)-1):
                    self.weights[layer] -= rate * numpy.outer(o[layer+1], d[layer])

n = Net(1, 4, 1)
x = numpy.linspace(0, 2*3.14, 10)
t = numpy.sin(x)
matplotlib.pyplot.plot(x, t, 'g')
matplotlib.pyplot.plot(x, map(n, x), 'r')
n.train(x, t)
print n.weights
matplotlib.pyplot.plot(x, map(n, x), 'b')
matplotlib.pyplot.show()
I haven't looked for a particular bug in your code, but can you please try the following things to narrow down your problem further? Otherwise it is very tedious to find the needle in the haystack.
1) Please try to use a real dataset to have an idea what to expect, e.g., MNIST, and/or standardize your data, because your weights may become NaN if they become too small.
2) Try different learning rates and plot the cost function vs. epochs to check if you are converging. It should look somewhat like this (note that I used minibatch learning and averaged the minibatch chunks for each epoch).
3) I see that you are using a sigmoid activation. Your implementation is correct, but to make it numerically more stable, replace 1.0 / (1.0 + np.exp(-z)) with expit(z) from scipy.special (the same function, but more robust).
4) Implement gradient checking: compare your analytical backpropagation gradient to a numerically approximated gradient, e.g. the forward difference (J(w + eps) - J(w)) / eps.
An even better approach, which yields a more accurate approximation of the gradient, is the symmetric (or centered) difference quotient given by the two-point formula (J(w + eps) - J(w - eps)) / (2 * eps); a rough sketch follows below.
PS: If you are interested and find it useful, I have a working vanilla NumPy neural net implemented here.
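As a rough sketch of point 4 for your Net class (the helper name check_gradient and the half squared-error loss are my assumptions, not something taken from your code):

import numpy

def check_gradient(net, x, t, layer, analytic_grad, eps=1e-5):
    # numerically approximate dE/dw for every weight in one layer of the net
    # and return the largest absolute difference from the analytic gradient
    w = net.weights[layer]
    numeric_grad = numpy.zeros_like(w)
    for i in range(w.shape[0]):
        for j in range(w.shape[1]):
            orig = w[i, j]
            w[i, j] = orig + eps
            e_plus = ((net(x) - t) ** 2).sum() / 2.0
            w[i, j] = orig - eps
            e_minus = ((net(x) - t) ** 2).sum() / 2.0
            w[i, j] = orig
            # centered difference quotient
            numeric_grad[i, j] = (e_plus - e_minus) / (2 * eps)
    return numpy.abs(numeric_grad - analytic_grad).max()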
I fixed it! Thanks for all the suggestions. I worked out numeric partials and found that my o and deltas were correct, but I was multiplying the wrong ones. That's why I now take numpy.outer(d[layer+1], o[layer]) instead of numpy.outer(d[layer], o[layer+1]).
I was also skipping the update on one layer. That's why I changed for layer in range(self.hidden_layers) to for layer in range(self.hidden_layers+1).
I'll add that I caught a bug just before posting originally. My output layer delta was incorrect because my net (intentionally) doesn't activate the final outputs, but my delta was computed as though it did.
I debugged primarily with a one-hidden-layer, one-hidden-unit net, then moved to a model with 2 inputs, 3 hidden layers of 2 neurons each, and 2 outputs.
from __future__ import division
import numpy
import scipy
import scipy.special
import matplotlib.pyplot
#from pylab import *

#numpy.random.seed(23)

def nmap(f, x):
    return numpy.array(map(f, x))

class Net:
    def __init__(self, *sizes):
        self.hidden_layers = len(sizes)-2
        self.weights = [numpy.random.uniform(-1, 1, (sizes[i+1], sizes[i])) for i in range(self.hidden_layers+1)]

    @staticmethod
    def activate(x):
        return scipy.special.expit(x)
        #return 1/(1+numpy.exp(-x))

    @staticmethod
    def activate_(x):
        s = scipy.special.expit(x)
        return s*(1-s)

    def y(self, x):
        o = [numpy.array(x)]  # o[i] is the (activated) output of hidden layer i, "hidden layer 0" is inputs and not activated
        for weight in self.weights[:-1]:
            o.append(Net.activate(weight.dot(o[-1])))
        o.append(self.weights[-1].dot(o[-1]))
        # for weight in self.weights:
        #     o.append(Net.activate(weight.dot(o[-1])))
        return o

    def __call__(self, x):
        return self.y(x)[-1]

    def delta(self, x, t):
        x = numpy.array(x)
        t = numpy.array(t)
        o = self.y(x)
        #delta = [(o[-1]-t) * o[-1] * (1-o[-1])]
        delta = [o[-1]-t]
        for i, weight in enumerate(reversed(self.weights)):
            delta.append(weight.T.dot(delta[-1]) * o[-i-2] * (1-o[-i-2]))
        delta.reverse()  # surely i need this
        return o, delta

    def train(self, inputs, outputs, epochs=1000, rate=.1):
        errors = []
        for epoch in range(epochs):
            for x, t in zip(inputs, outputs):  # shuffle? subset?
                o, d = self.delta(x, t)
                for layer in range(self.hidden_layers+1):
                    grad = numpy.outer(d[layer+1], o[layer])
                    self.weights[layer] -= rate * grad
        return errors

    def rmse(self, inputs, outputs):
        return ((outputs - nmap(self, inputs))**2).sum()**.5/len(inputs)

n = Net(1, 8, 1)
X = numpy.linspace(0, 2*3.1415, 10)
T = numpy.sin(X)
Y = map(n, X)
Y = numpy.array([y[0, 0] for y in Y])
matplotlib.pyplot.plot(X, T, 'g')
matplotlib.pyplot.plot(X, Y, 'r')
print 'output successful'
print n.rmse(X, T)
errors = n.train(X, T)
print 'tried to train successfully'
print n.rmse(X, T)
Y = map(n, X)
Y = numpy.array([y[0, 0] for y in Y])
matplotlib.pyplot.plot(X, Y, 'b')
matplotlib.pyplot.show()